• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/tile.h"
16 
17 #include <algorithm>
18 #include <array>
19 #include <cassert>
20 #include <climits>
21 #include <condition_variable>  // NOLINT (unapproved c++11 header)
22 #include <cstdlib>
23 #include <cstring>
24 #include <memory>
25 #include <mutex>  // NOLINT (unapproved c++11 header)
26 #include <new>
27 #include <numeric>
28 #include <type_traits>
29 #include <utility>
30 
31 #include "src/frame_scratch_buffer.h"
32 #include "src/motion_vector.h"
33 #include "src/reconstruction.h"
34 #include "src/utils/bit_mask_set.h"
35 #include "src/utils/common.h"
36 #include "src/utils/constants.h"
37 #include "src/utils/logging.h"
38 #include "src/utils/segmentation.h"
39 #include "src/utils/stack.h"
40 
41 namespace libgav1 {
42 namespace {
43 
44 // Import all the constants in the anonymous namespace.
45 #include "src/scan_tables.inc"
46 
// Range above kNumQuantizerBaseLevels above which the exponential golomb
// coding process is activated.
constexpr int kQuantizerCoefficientBaseRange = 12;
constexpr int kNumQuantizerBaseLevels = 2;
constexpr int kCoeffBaseRangeMaxIterations =
    kQuantizerCoefficientBaseRange / (kCoeffBaseRangeSymbolCount - 1);
// Indices into the entropy context arrays (e.g. |coefficient_levels_|,
// |dc_categories_|): index 0 holds the left context (one entry per frame
// row), index 1 the top context (one entry per frame column).
constexpr int kEntropyContextLeft = 0;
constexpr int kEntropyContextTop = 1;

// Context derivation for the all_zero symbol from the top and left entropy
// context values. NOTE(review): indexing appears to be [top][left] with both
// values clamped to 4 — confirm at the call site.
constexpr uint8_t kAllZeroContextsByTopLeft[5][5] = {{1, 2, 2, 2, 3},
                                                     {2, 4, 4, 4, 5},
                                                     {2, 4, 4, 4, 5},
                                                     {2, 4, 4, 4, 5},
                                                     {3, 5, 5, 5, 6}};
61 
// The space complexity of DFS is O(branching_factor * max_depth). For the
// parameter tree, branching_factor = 4 (there could be up to 4 children for
// every node) and max_depth (excluding the root) = 5 (to go from a 128x128
// block all the way to a 4x4 block). The worst-case stack size is 16, by
// counting the number of 'o' nodes in the diagram:
//
//   |                    128x128  The highest level (corresponding to the
//   |                             root of the tree) has no node in the stack.
//   |-----------------+
//   |     |     |     |
//   |     o     o     o  64x64
//   |
//   |-----------------+
//   |     |     |     |
//   |     o     o     o  32x32    Higher levels have three nodes in the stack,
//   |                             because we pop one node off the stack before
//   |-----------------+           pushing its four children onto the stack.
//   |     |     |     |
//   |     o     o     o  16x16
//   |
//   |-----------------+
//   |     |     |     |
//   |     o     o     o  8x8
//   |
//   |-----------------+
//   |     |     |     |
//   o     o     o     o  4x4      Only the lowest level has four nodes in the
//                                 stack.
constexpr int kDfsStackSize = 16;
91 
// Mask indicating whether the transform sets contain a particular transform
// type. If |tx_type| is present in |tx_set|, then the |tx_type|th LSB is set.
constexpr BitMaskSet kTransformTypeInSetMask[kNumTransformSets] = {
    BitMaskSet(0x1),    BitMaskSet(0xE0F), BitMaskSet(0x20F),
    BitMaskSet(0xFFFF), BitMaskSet(0xFFF), BitMaskSet(0x201)};

// Regular intra prediction mode associated with each filter intra mode.
// NOTE(review): presumably used where a filter-intra block must be treated as
// a regular intra mode (e.g. context derivation) — confirm at call sites.
constexpr PredictionMode
    kFilterIntraModeToIntraPredictor[kNumFilterIntraPredictors] = {
        kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
        kPredictionModeD157, kPredictionModeDc};

// Mask used to determine the index for mode_deltas lookup.
constexpr BitMaskSet kPredictionModeDeltasMask(
    kPredictionModeNearestMv, kPredictionModeNearMv, kPredictionModeNewMv,
    kPredictionModeNearestNearestMv, kPredictionModeNearNearMv,
    kPredictionModeNearestNewMv, kPredictionModeNewNearestMv,
    kPredictionModeNearNewMv, kPredictionModeNewNearMv,
    kPredictionModeNewNewMv);

// This is computed as:
// min(transform_width_log2, 5) + min(transform_height_log2, 5) - 4.
constexpr uint8_t kEobMultiSizeLookup[kNumTransformSizes] = {
    0, 1, 2, 1, 2, 3, 4, 2, 3, 4, 5, 5, 4, 5, 6, 6, 5, 6, 6};
115 
// Offsets added when deriving the coeff_base context, indexed by transform
// size and by the row/column position of the coefficient. NOTE(review): the
// row/column indices appear to be clamped to 4 by the caller — confirm at the
// call site before relying on this description.
/* clang-format off */
constexpr uint8_t kCoeffBaseContextOffset[kNumTransformSizes][5][5] = {
    {{0, 1, 6, 6, 0}, {1, 6, 6, 21, 0}, {6, 6, 21, 21, 0}, {6, 21, 21, 21, 0},
     {0, 0, 0, 0, 0}},
    {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0},
     {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}},
    {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0},
     {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}},
    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
     {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}},
    {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
    {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
    {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
     {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}},
    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
     {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
    {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
    {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
    {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
     {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
     {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
    {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
    {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
     {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
     {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
    {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}};
/* clang-format on */

// The table is extended from 3 entries to 16 by repeating the last element,
// which avoids having to clamp the row or column indices before the lookup.
constexpr uint8_t kCoeffBasePositionContextOffset[16] = {
    26, 31, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36};
162 
// Intra prediction mode associated with each inter-intra mode.
constexpr PredictionMode kInterIntraToIntraMode[kNumInterIntraModes] = {
    kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
    kPredictionModeSmooth};

// Number of horizontal luma samples before intra block copy can be used.
constexpr int kIntraBlockCopyDelayPixels = 256;
// Number of 64 by 64 blocks before intra block copy can be used.
constexpr int kIntraBlockCopyDelay64x64Blocks = kIntraBlockCopyDelayPixels / 64;

// Index [i][j] corresponds to the transform size of width 1 << (i + 2) and
// height 1 << (j + 2). Width/height combinations that have no corresponding
// transform size are marked with kNumTransformSizes.
constexpr TransformSize k4x4SizeToTransformSize[5][5] = {
    {kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
     kNumTransformSizes, kNumTransformSizes},
    {kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
     kTransformSize8x32, kNumTransformSizes},
    {kTransformSize16x4, kTransformSize16x8, kTransformSize16x16,
     kTransformSize16x32, kTransformSize16x64},
    {kNumTransformSizes, kTransformSize32x8, kTransformSize32x16,
     kTransformSize32x32, kTransformSize32x64},
    {kNumTransformSizes, kNumTransformSizes, kTransformSize64x16,
     kTransformSize64x32, kTransformSize64x64}};
185 
// Defined in section 9.3 of the spec. Maps an intra prediction mode to the
// transform type used when the transform type is implied by the mode.
constexpr TransformType kModeToTransformType[kIntraPredictionModesUV] = {
    kTransformTypeDctDct,   kTransformTypeDctAdst,  kTransformTypeAdstDct,
    kTransformTypeDctDct,   kTransformTypeAdstAdst, kTransformTypeDctAdst,
    kTransformTypeAdstDct,  kTransformTypeAdstDct,  kTransformTypeDctAdst,
    kTransformTypeAdstAdst, kTransformTypeDctAdst,  kTransformTypeAdstDct,
    kTransformTypeAdstAdst, kTransformTypeDctDct};

// Defined in section 5.11.47 of the spec. This array does not contain an entry
// for kTransformSetDctOnly, so the first dimension needs to be
// |kNumTransformSets| - 1. Each row lists the transform types belonging to
// one set, in symbol order; unused trailing entries are zero-initialized.
constexpr TransformType kInverseTransformTypeBySet[kNumTransformSets - 1][16] =
    {{kTransformTypeIdentityIdentity, kTransformTypeDctDct,
      kTransformTypeIdentityDct, kTransformTypeDctIdentity,
      kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
     {kTransformTypeIdentityIdentity, kTransformTypeDctDct,
      kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
     {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
      kTransformTypeDctIdentity, kTransformTypeIdentityAdst,
      kTransformTypeAdstIdentity, kTransformTypeIdentityFlipadst,
      kTransformTypeFlipadstIdentity, kTransformTypeDctDct,
      kTransformTypeDctAdst, kTransformTypeAdstDct, kTransformTypeDctFlipadst,
      kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
      kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
      kTransformTypeAdstFlipadst},
     {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
      kTransformTypeDctIdentity, kTransformTypeDctDct, kTransformTypeDctAdst,
      kTransformTypeAdstDct, kTransformTypeDctFlipadst,
      kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
      kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
      kTransformTypeAdstFlipadst},
     {kTransformTypeIdentityIdentity, kTransformTypeDctDct}};
218 
// Replaces all occurrences of 64x* and *x64 with 32x* and *x32 respectively.
constexpr TransformSize kAdjustedTransformSize[kNumTransformSizes] = {
    kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
    kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
    kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
    kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
    kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
    kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
    kTransformSize32x32};

// This is the same as Max_Tx_Size_Rect array in the spec but with *x64 and
// 64x* transforms replaced with *x32 and 32x* respectively.
constexpr TransformSize kUVTransformSize[kMaxBlockSizes] = {
    kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
    kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
    kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
    kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
    kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
    kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
    kTransformSize32x32, kTransformSize32x32, kTransformSize32x32,
    kTransformSize32x32};

// ith entry of this array is computed as:
// DivideBy2(TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[i]) +
//           TransformSizeToSquareTransformIndex(kTransformSizeSquareMax[i]) +
//           1)
constexpr uint8_t kTransformSizeContext[kNumTransformSizes] = {
    0, 1, 1, 1, 1, 2, 2, 1, 2, 2, 3, 3, 2, 3, 3, 4, 3, 4, 4};

// Default multipliers for the self-guided restoration projection filter.
constexpr int8_t kSgrProjDefaultMultiplier[2] = {-32, 31};

// Default Wiener filter coefficients.
constexpr int8_t kWienerDefaultFilter[kNumWienerCoefficients] = {3, -7, 15};
251 
// Maps compound prediction modes into single modes. For e.g.
// kPredictionModeNearestNewMv will map to kPredictionModeNearestMv for index 0
// and kPredictionModeNewMv for index 1. It is used to simplify the logic in
// AssignMv (and avoid duplicate code). This is section 5.11.30. in the spec.
constexpr PredictionMode
    kCompoundToSinglePredictionMode[kNumCompoundInterPredictionModes][2] = {
        {kPredictionModeNearestMv, kPredictionModeNearestMv},  // NearestNearest
        {kPredictionModeNearMv, kPredictionModeNearMv},        // NearNear
        {kPredictionModeNearestMv, kPredictionModeNewMv},      // NearestNew
        {kPredictionModeNewMv, kPredictionModeNearestMv},      // NewNearest
        {kPredictionModeNearMv, kPredictionModeNewMv},         // NearNew
        {kPredictionModeNewMv, kPredictionModeNearMv},         // NewNear
        {kPredictionModeGlobalMv, kPredictionModeGlobalMv},    // GlobalGlobal
        {kPredictionModeNewMv, kPredictionModeNewMv},          // NewNew
};
GetSinglePredictionMode(int index,PredictionMode y_mode)267 PredictionMode GetSinglePredictionMode(int index, PredictionMode y_mode) {
268   if (y_mode < kPredictionModeNearestNearestMv) {
269     return y_mode;
270   }
271   const int lookup_index = y_mode - kPredictionModeNearestNearestMv;
272   assert(lookup_index >= 0);
273   return kCompoundToSinglePredictionMode[lookup_index][index];
274 }
275 
// log2(dqDenom) in section 7.12.3 of the spec. We use the log2 value because
// dqDenom is always a power of two and hence right shift can be used instead of
// division. Indexed by TransformSize.
constexpr uint8_t kQuantizationShift[kNumTransformSizes] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 2, 1, 2, 2};
281 
// Returns the minimum of |length| or |max|-|start|. This is used to clamp
// array indices when accessing arrays whose bound is equal to |max|.
int GetNumElements(int length, int start, int max) {
  const int remaining = max - start;
  return std::min(length, remaining);
}
287 
288 template <typename T>
SetBlockValues(int rows,int columns,T value,T * dst,ptrdiff_t stride)289 void SetBlockValues(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
290   // Specialize all columns cases (values in kTransformWidth4x4[]) for better
291   // performance.
292   switch (columns) {
293     case 1:
294       MemSetBlock<T>(rows, 1, value, dst, stride);
295       break;
296     case 2:
297       MemSetBlock<T>(rows, 2, value, dst, stride);
298       break;
299     case 4:
300       MemSetBlock<T>(rows, 4, value, dst, stride);
301       break;
302     case 8:
303       MemSetBlock<T>(rows, 8, value, dst, stride);
304       break;
305     default:
306       assert(columns == 16);
307       MemSetBlock<T>(rows, 16, value, dst, stride);
308       break;
309   }
310 }
311 
SetTransformType(const Tile::Block & block,int x4,int y4,int w4,int h4,TransformType tx_type,TransformType transform_types[32][32])312 void SetTransformType(const Tile::Block& block, int x4, int y4, int w4, int h4,
313                       TransformType tx_type,
314                       TransformType transform_types[32][32]) {
315   const int y_offset = y4 - block.row4x4;
316   const int x_offset = x4 - block.column4x4;
317   TransformType* const dst = &transform_types[y_offset][x_offset];
318   SetBlockValues<TransformType>(h4, w4, tx_type, dst, 32);
319 }
320 
StoreMotionFieldMvs(ReferenceFrameType reference_frame_to_store,const MotionVector & mv_to_store,ptrdiff_t stride,int rows,int columns,ReferenceFrameType * reference_frame_row_start,MotionVector * mv)321 void StoreMotionFieldMvs(ReferenceFrameType reference_frame_to_store,
322                          const MotionVector& mv_to_store, ptrdiff_t stride,
323                          int rows, int columns,
324                          ReferenceFrameType* reference_frame_row_start,
325                          MotionVector* mv) {
326   static_assert(sizeof(*reference_frame_row_start) == sizeof(int8_t), "");
327   do {
328     // Don't switch the following two memory setting functions.
329     // Some ARM CPUs are quite sensitive to the order.
330     memset(reference_frame_row_start, reference_frame_to_store, columns);
331     std::fill(mv, mv + columns, mv_to_store);
332     reference_frame_row_start += stride;
333     mv += stride;
334   } while (--rows != 0);
335 }
336 
// The inverse transform process assumes that the quantized coefficients are
// stored as a virtual 2d array of size |tx_width| x tx_height. When the
// transform width is 64 that assumption is broken, because such transforms
// are populated using the scan order of the corresponding width-32 transform
// (e.g. 64x16 uses the 32x16 scan order). This function moves the
// coefficients to their correct positions and zeroes the slots they used to
// occupy.
template <typename ResidualType>
void MoveCoefficientsForTxWidth64(int clamped_tx_height, int tx_width,
                                  ResidualType* residual) {
  if (tx_width != 64) return;
  const int rows = clamped_tx_height - 2;
  auto* src = residual + 32 * rows;
  auto* dst = residual + 64 * rows;
  // Walk the rows in reverse order, two at a time, so that no source row is
  // overwritten before it has been copied.
  int row_pairs = rows >> 1;
  do {
    // Within a pair the two rows can be handled front to back.
    memcpy(dst, src, 32 * sizeof(src[0]));
    memcpy(dst + 64, src + 32, 32 * sizeof(src[0]));
    memset(src + 32, 0, 32 * sizeof(src[0]));
    src -= 64;
    dst -= 128;
  } while (--row_pairs != 0);
  // Only row 1 remains to be moved; row 0 is already at its final position.
  memcpy(dst + 64, src + 32, 32 * sizeof(src[0]));
  memset(src + 32, 0, 32 * sizeof(src[0]));
}
365 
GetClampParameters(const Tile::Block & block,int min[2],int max[2])366 void GetClampParameters(const Tile::Block& block, int min[2], int max[2]) {
367   // 7.10.2.14 (part 1). (also contains implementations of 5.11.53
368   // and 5.11.54).
369   constexpr int kMvBorder4x4 = 4;
370   const int row_border = kMvBorder4x4 + block.height4x4;
371   const int column_border = kMvBorder4x4 + block.width4x4;
372   const int macroblocks_to_top_edge = -block.row4x4;
373   const int macroblocks_to_bottom_edge =
374       block.tile.frame_header().rows4x4 - block.height4x4 - block.row4x4;
375   const int macroblocks_to_left_edge = -block.column4x4;
376   const int macroblocks_to_right_edge =
377       block.tile.frame_header().columns4x4 - block.width4x4 - block.column4x4;
378   min[0] = MultiplyBy32(macroblocks_to_top_edge - row_border);
379   min[1] = MultiplyBy32(macroblocks_to_left_edge - column_border);
380   max[0] = MultiplyBy32(macroblocks_to_bottom_edge + row_border);
381   max[1] = MultiplyBy32(macroblocks_to_right_edge + column_border);
382 }
383 
384 // Section 8.3.2 in the spec, under coeff_base_eob.
GetCoeffBaseContextEob(TransformSize tx_size,int index)385 int GetCoeffBaseContextEob(TransformSize tx_size, int index) {
386   if (index == 0) return 0;
387   const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
388   const int tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
389   const int tx_height = kTransformHeight[adjusted_tx_size];
390   if (index <= DivideBy8(tx_height << tx_width_log2)) return 1;
391   if (index <= DivideBy4(tx_height << tx_width_log2)) return 2;
392   return 3;
393 }
394 
// Section 8.3.2 in the spec, under coeff_br. Optimized for end of block based
// on the fact that {0, 1}, {1, 0}, {1, 1}, {0, 2} and {2, 0} will all be 0 in
// the end of block case.
int GetCoeffBaseRangeContextEob(int adjusted_tx_width_log2, int pos,
                                TransformClass tx_class) {
  if (pos == 0) return 0;
  const int tx_width = 1 << adjusted_tx_width_log2;
  const int row = pos >> adjusted_tx_width_log2;
  const int column = pos & (tx_width - 1);
  // This return statement is equivalent to:
  // return ((tx_class == kTransformClass2D && (row | column) < 2) ||
  //         (tx_class == kTransformClassHorizontal && column == 0) ||
  //         (tx_class == kTransformClassVertical && row == 0))
  //            ? 7
  //            : 14;
  // The branchless form relies on the numeric values of TransformClass
  // (presumably 2D == 0, Horizontal == 1, Vertical == 2 — NOTE(review):
  // confirm against the enum definition): bit 0 of |tx_class| gates the
  // column == 0 test and bit 1 gates the row == 0 test. If any condition
  // holds, the shift amount is 1 and 14 >> 1 == 7 is returned.
  return 14 >> ((static_cast<int>(tx_class == kTransformClass2D) &
                 static_cast<int>((row | column) < 2)) |
                (tx_class & static_cast<int>(column == 0)) |
                ((tx_class >> 1) & static_cast<int>(row == 0)));
}
415 
416 }  // namespace
417 
// Constructs the tile identified by |tile_number|, wiring it to the frame
// level state (headers, scratch buffers, post filter, thread pool) and
// initializing the per-tile bitstream reader over [data, data + size).
Tile::Tile(int tile_number, const uint8_t* const data, size_t size,
           const ObuSequenceHeader& sequence_header,
           const ObuFrameHeader& frame_header,
           RefCountedBuffer* const current_frame, const DecoderState& state,
           FrameScratchBuffer* const frame_scratch_buffer,
           const WedgeMaskArray& wedge_masks,
           const QuantizerMatrix& quantizer_matrix,
           SymbolDecoderContext* const saved_symbol_decoder_context,
           const SegmentationMap* prev_segment_ids,
           PostFilter* const post_filter, const dsp::Dsp* const dsp,
           ThreadPool* const thread_pool,
           BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
           bool use_intra_prediction_buffer, bool parse_only)
    : number_(tile_number),
      // Tiles are numbered in raster order within the tile grid.
      row_(number_ / frame_header.tile_info.tile_columns),
      column_(number_ % frame_header.tile_info.tile_columns),
      data_(data),
      size_(size),
      read_deltas_(false),
      // Plane 0 (Y) is never subsampled; planes 1 and 2 share the color
      // config subsampling.
      subsampling_x_{0, sequence_header.color_config.subsampling_x,
                     sequence_header.color_config.subsampling_x},
      subsampling_y_{0, sequence_header.color_config.subsampling_y,
                     sequence_header.color_config.subsampling_y},
      current_quantizer_index_(frame_header.quantizer.base_index),
      sequence_header_(sequence_header),
      frame_header_(frame_header),
      reference_frame_sign_bias_(state.reference_frame_sign_bias),
      reference_frames_(state.reference_frame),
      motion_field_(frame_scratch_buffer->motion_field),
      reference_order_hint_(state.reference_order_hint),
      wedge_masks_(wedge_masks),
      quantizer_matrix_(quantizer_matrix),
      reader_(data_, size_, frame_header_.enable_cdf_update),
      symbol_decoder_context_(frame_scratch_buffer->symbol_decoder_context),
      saved_symbol_decoder_context_(saved_symbol_decoder_context),
      prev_segment_ids_(prev_segment_ids),
      dsp_(*dsp),
      post_filter_(*post_filter),
      block_parameters_holder_(frame_scratch_buffer->block_parameters_holder),
      quantizer_(sequence_header_.color_config.bitdepth,
                 &frame_header_.quantizer),
      // 8-bit streams store residuals as int16_t; higher bitdepths need
      // int32_t.
      residual_size_((sequence_header_.color_config.bitdepth == 8)
                         ? sizeof(int16_t)
                         : sizeof(int32_t)),
      // NOTE(review): 3 and 5 appear to be the intra block copy delay in
      // superblocks for 128x128 and 64x64 superblocks respectively (see
      // kIntraBlockCopyDelayPixels) — confirm against the consumers.
      intra_block_copy_lag_(
          frame_header_.allow_intrabc
              ? (sequence_header_.use_128x128_superblock ? 3 : 5)
              : 1),
      current_frame_(*current_frame),
      cdef_index_(frame_scratch_buffer->cdef_index),
      cdef_skip_(frame_scratch_buffer->cdef_skip),
      inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
      thread_pool_(thread_pool),
      residual_buffer_pool_(frame_scratch_buffer->residual_buffer_pool.get()),
      tile_scratch_buffer_pool_(
          &frame_scratch_buffer->tile_scratch_buffer_pool),
      pending_tiles_(pending_tiles),
      frame_parallel_(frame_parallel),
      use_intra_prediction_buffer_(use_intra_prediction_buffer),
      // One intra prediction buffer per tile row, shared across the row.
      intra_prediction_buffer_(
          use_intra_prediction_buffer_
              ? &frame_scratch_buffer->intra_prediction_buffers.get()[row_]
              : nullptr),
      parse_only_(parse_only) {
  row4x4_start_ = frame_header.tile_info.tile_row_start[row_];
  row4x4_end_ = frame_header.tile_info.tile_row_start[row_ + 1];
  column4x4_start_ = frame_header.tile_info.tile_column_start[column_];
  column4x4_end_ = frame_header.tile_info.tile_column_start[column_ + 1];
  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
  // NOTE(review): the height log2 table is used here for a width-derived
  // shift; this is valid because superblocks are square.
  const int block_width4x4_log2 = k4x4HeightLog2[SuperBlockSize()];
  superblock_rows_ =
      (row4x4_end_ - row4x4_start_ + block_width4x4 - 1) >> block_width4x4_log2;
  superblock_columns_ =
      (column4x4_end_ - column4x4_start_ + block_width4x4 - 1) >>
      block_width4x4_log2;
  // If |split_parse_and_decode_| is true, we do the necessary setup for
  // splitting the parsing and the decoding steps. This is done in the following
  // three cases:
  //  1) If there is multi-threading within a tile (this is done if
  //     |thread_pool_| is not nullptr and if there are at least as many
  //     superblock columns as |intra_block_copy_lag_|).
  //  2) If |frame_parallel| is true.
  //  3) If |parse_only_| is true.
  split_parse_and_decode_ = (thread_pool_ != nullptr &&
                             superblock_columns_ > intra_block_copy_lag_) ||
                            frame_parallel || parse_only_;
  if (frame_parallel_) {
    // INT_MIN acts as the "no progress cached yet" sentinel.
    reference_frame_progress_cache_.fill(INT_MIN);
  }
  memset(delta_lf_, 0, sizeof(delta_lf_));
  delta_lf_all_zero_ = true;
  const YuvBuffer& buffer = post_filter_.frame_buffer();
  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
    // Verify that the borders are big enough for Reconstruct(). max_tx_length
    // is the maximum value of tx_width and tx_height for the plane.
    const int max_tx_length = (plane == kPlaneY) ? 64 : 32;
    // Reconstruct() may overwrite on the right. Since the right border of a
    // row is followed in memory by the left border of the next row, the
    // number of extra pixels to the right of a row is at least the sum of the
    // left and right borders.
    //
    // Note: This assertion actually checks the sum of the left and right
    // borders of post_filter_.GetUnfilteredBuffer(), which is a horizontally
    // and vertically shifted version of |buffer|. Since the sum of the left and
    // right borders is not changed by the shift, we can just check the sum of
    // the left and right borders of |buffer|.
    assert(buffer.left_border(plane) + buffer.right_border(plane) >=
           max_tx_length - 1);
    // Reconstruct() may overwrite on the bottom. We need an extra border row
    // on the bottom because we need the left border of that row.
    //
    // Note: This assertion checks the bottom border of
    // post_filter_.GetUnfilteredBuffer(). So we need to calculate the vertical
    // shift that the PostFilter constructor applied to |buffer| and reduce the
    // bottom border by that amount.
#ifndef NDEBUG
    const int vertical_shift = static_cast<int>(
        (post_filter_.GetUnfilteredBuffer(plane) - buffer.data(plane)) /
        buffer.stride(plane));
    const int bottom_border = buffer.bottom_border(plane) - vertical_shift;
    assert(bottom_border >= max_tx_length);
#endif
    // In AV1, a transform block of height H starts at a y coordinate that is
    // a multiple of H. If a transform block at the bottom of the frame has
    // height H, then Reconstruct() will write up to the row with index
    // Align(buffer.height(plane), H) - 1. Therefore the maximum number of
    // rows Reconstruct() may write to is
    // Align(buffer.height(plane), max_tx_length).
    buffer_[plane].Reset(Align(buffer.height(plane), max_tx_length),
                         buffer.stride(plane),
                         post_filter_.GetUnfilteredBuffer(plane));
  }
}
551 
// Allocates the per-tile buffers: the left/top entropy context arrays, the
// residual buffer(s) and the top context. Also sets up the motion field when
// reference frame MVs are in use. Returns false on allocation failure.
bool Tile::Init() {
  assert(coefficient_levels_.size() == dc_categories_.size());
  for (size_t i = 0; i < coefficient_levels_.size(); ++i) {
    // The left context has one entry per frame row; the top context has one
    // entry per frame column.
    const int contexts_per_plane = (i == kEntropyContextLeft)
                                       ? frame_header_.rows4x4
                                       : frame_header_.columns4x4;
    if (!coefficient_levels_[i].Reset(PlaneCount(), contexts_per_plane)) {
      LIBGAV1_DLOG(ERROR, "coefficient_levels_[%zu].Reset() failed.", i);
      return false;
    }
    if (!dc_categories_[i].Reset(PlaneCount(), contexts_per_plane)) {
      LIBGAV1_DLOG(ERROR, "dc_categories_[%zu].Reset() failed.", i);
      return false;
    }
  }
  if (split_parse_and_decode_) {
    // Parsing and decoding are decoupled: each superblock gets its own
    // residual buffer (from the pool) so decode can lag behind parse.
    assert(residual_buffer_pool_ != nullptr);
    if (!residual_buffer_threaded_.Reset(superblock_rows_, superblock_columns_,
                                         /*zero_initialize=*/false)) {
      LIBGAV1_DLOG(ERROR, "residual_buffer_threaded_.Reset() failed.");
      return false;
    }
  } else {
    // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary
    // checks when parsing quantized coefficients.
    residual_buffer_ = MakeAlignedUniquePtr<uint8_t>(
        32, (4096 + 32 * kResidualPaddingVertical) * residual_size_);
    if (residual_buffer_ == nullptr) {
      LIBGAV1_DLOG(ERROR, "Allocation of residual_buffer_ failed.");
      return false;
    }
    prediction_parameters_.reset(new (std::nothrow) PredictionParameters());
    if (prediction_parameters_ == nullptr) {
      LIBGAV1_DLOG(ERROR, "Allocation of prediction_parameters_ failed.");
      return false;
    }
  }
  if (frame_header_.use_ref_frame_mvs) {
    assert(sequence_header_.enable_order_hint);
    SetupMotionField(frame_header_, current_frame_, reference_frames_,
                     row4x4_start_, row4x4_end_, column4x4_start_,
                     column4x4_end_, &motion_field_);
  }
  ResetLoopRestorationParams();
  if (!top_context_.Resize(superblock_columns_)) {
    LIBGAV1_DLOG(ERROR, "Allocation of top_context_ failed.");
    return false;
  }
  return true;
}
602 
// Processes (parses and/or decodes, per the compile-time |processing_mode|)
// one row of superblocks belonging to this tile. Returns true on success,
// including the no-op case where |row4x4| lies outside the tile's row range.
// When |save_symbol_decoder_context| is set, the entropy coder state is saved
// after the tile's last superblock row has been processed.
template <ProcessingMode processing_mode, bool save_symbol_decoder_context>
bool Tile::ProcessSuperBlockRow(int row4x4,
                                TileScratchBuffer* const scratch_buffer) {
  if (row4x4 < row4x4_start_ || row4x4 >= row4x4_end_) return true;
  assert(scratch_buffer != nullptr);
  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
  for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_;
       column4x4 += block_width4x4) {
    if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer,
                           processing_mode)) {
      LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d",
                   row4x4, column4x4);
      return false;
    }
  }
  // |row4x4 + block_width4x4 >= row4x4_end_| means this was the last
  // superblock row of the tile.
  if (save_symbol_decoder_context && row4x4 + block_width4x4 >= row4x4_end_) {
    SaveSymbolDecoderContext();
  }
  // The intra prediction buffer only needs to be populated when pixels are
  // actually reconstructed, i.e. in the two decoding modes.
  if (processing_mode == kProcessingModeDecodeOnly ||
      processing_mode == kProcessingModeParseAndDecode) {
    PopulateIntraPredictionBuffer(row4x4);
  }
  return true;
}
627 
// Explicit instantiations for the two <processing_mode, save_context>
// combinations used by this file.
// Used in frame parallel mode. The symbol decoder context need not be saved in
// this case since it was done when parsing was complete.
template bool Tile::ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
    int row4x4, TileScratchBuffer* scratch_buffer);
// Used in non frame parallel mode.
template bool Tile::ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
    int row4x4, TileScratchBuffer* scratch_buffer);
635 
SaveSymbolDecoderContext()636 void Tile::SaveSymbolDecoderContext() {
637   if (frame_header_.enable_frame_end_update_cdf &&
638       number_ == frame_header_.tile_info.context_update_id) {
639     *saved_symbol_decoder_context_ = symbol_decoder_context_;
640   }
641 }
642 
ParseAndDecode()643 bool Tile::ParseAndDecode() {
644   if (split_parse_and_decode_) {
645     if (!ThreadedParseAndDecode()) return false;
646     SaveSymbolDecoderContext();
647     return true;
648   }
649   std::unique_ptr<TileScratchBuffer> scratch_buffer =
650       tile_scratch_buffer_pool_->Get();
651   if (scratch_buffer == nullptr) {
652     pending_tiles_->Decrement(false);
653     LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
654     return false;
655   }
656   const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
657   for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
658        row4x4 += block_width4x4) {
659     if (!ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
660             row4x4, scratch_buffer.get())) {
661       pending_tiles_->Decrement(false);
662       return false;
663     }
664   }
665   tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
666   pending_tiles_->Decrement(true);
667   return true;
668 }
669 
Parse()670 bool Tile::Parse() {
671   const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
672   std::unique_ptr<TileScratchBuffer> scratch_buffer =
673       tile_scratch_buffer_pool_->Get();
674   if (scratch_buffer == nullptr) {
675     LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
676     return false;
677   }
678   for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
679        row4x4 += block_width4x4) {
680     if (!ProcessSuperBlockRow<kProcessingModeParseOnly, false>(
681             row4x4, scratch_buffer.get())) {
682       return false;
683     }
684   }
685   tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
686   SaveSymbolDecoderContext();
687   return true;
688 }
689 
// Decode-only pass over the tile (frame parallel mode; parsing has already
// happened). After each superblock row is reconstructed, in-tile deblocking is
// applied and |superblock_row_progress| (shared by all tiles of the frame,
// indexed by frame-level superblock row) is incremented under |mutex|. The
// tile that finishes a row last signals the corresponding entry of
// |superblock_row_progress_condvar| so post filtering can proceed.
bool Tile::Decode(
    std::mutex* const mutex, int* const superblock_row_progress,
    std::condition_variable* const superblock_row_progress_condvar) {
  const int block_width4x4 = sequence_header_.use_128x128_superblock ? 32 : 16;
  const int block_width4x4_log2 =
      sequence_header_.use_128x128_superblock ? 5 : 4;
  std::unique_ptr<TileScratchBuffer> scratch_buffer =
      tile_scratch_buffer_pool_->Get();
  if (scratch_buffer == nullptr) {
    LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
    return false;
  }
  // |index| is the frame-level superblock row index of |row4x4|.
  for (int row4x4 = row4x4_start_, index = row4x4_start_ >> block_width4x4_log2;
       row4x4 < row4x4_end_; row4x4 += block_width4x4, ++index) {
    if (!ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
            row4x4, scratch_buffer.get())) {
      return false;
    }
    if (post_filter_.DoDeblock()) {
      // Apply vertical deblock filtering for all the columns in this tile
      // except for the first 64 columns.
      post_filter_.ApplyDeblockFilter(
          kLoopFilterTypeVertical, row4x4,
          column4x4_start_ + kNum4x4InLoopFilterUnit, column4x4_end_,
          block_width4x4);
      // If this is the first superblock row of the tile, then we cannot apply
      // horizontal deblocking here since we don't know if the top row is
      // available. So it will be done by the calling thread in that case.
      if (row4x4 != row4x4_start_) {
        // Apply horizontal deblock filtering for all the columns in this tile
        // except for the first and the last 64 columns.
        // Note about the last tile of each row: For the last tile,
        // column4x4_end may not be a multiple of 16. In that case it is still
        // okay to simply subtract 16 since ApplyDeblockFilter() will only do
        // the filters in increments of 64 columns (or 32 columns for chroma
        // with subsampling).
        post_filter_.ApplyDeblockFilter(
            kLoopFilterTypeHorizontal, row4x4,
            column4x4_start_ + kNum4x4InLoopFilterUnit,
            column4x4_end_ - kNum4x4InLoopFilterUnit, block_width4x4);
      }
    }
    bool notify;
    {
      // This superblock row is done for this tile; the row is fully decoded
      // once every tile column has incremented the counter.
      std::unique_lock<std::mutex> lock(*mutex);
      notify = ++superblock_row_progress[index] ==
               frame_header_.tile_info.tile_columns;
    }
    if (notify) {
      // We are done decoding this superblock row. Notify the post filtering
      // thread.
      superblock_row_progress_condvar[index].notify_one();
    }
  }
  tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
  return true;
}
747 
// Parses the entire tile on the calling thread while scheduling the decode of
// each superblock onto |thread_pool_| as soon as its dependencies (tracked in
// |threading_.sb_state|) are satisfied. Returns false if parsing failed or any
// decode job aborted. The job that drops |threading_.pending_jobs| to zero
// reports tile completion through |pending_tiles_|.
bool Tile::ThreadedParseAndDecode() {
  {
    std::lock_guard<std::mutex> lock(threading_.mutex);
    if (!threading_.sb_state.Reset(superblock_rows_, superblock_columns_)) {
      pending_tiles_->Decrement(false);
      LIBGAV1_DLOG(ERROR, "threading.sb_state.Reset() failed.");
      return false;
    }
    // Account for the parsing job.
    ++threading_.pending_jobs;
  }

  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];

  // Begin parsing.
  std::unique_ptr<TileScratchBuffer> scratch_buffer =
      tile_scratch_buffer_pool_->Get();
  if (scratch_buffer == nullptr) {
    pending_tiles_->Decrement(false);
    LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
    return false;
  }
  for (int row4x4 = row4x4_start_, row_index = 0; row4x4 < row4x4_end_;
       row4x4 += block_width4x4, ++row_index) {
    for (int column4x4 = column4x4_start_, column_index = 0;
         column4x4 < column4x4_end_;
         column4x4 += block_width4x4, ++column_index) {
      if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
                             kProcessingModeParseOnly)) {
        std::lock_guard<std::mutex> lock(threading_.mutex);
        threading_.abort = true;
        break;
      }
      std::unique_lock<std::mutex> lock(threading_.mutex);
      if (threading_.abort) break;
      threading_.sb_state[row_index][column_index] = kSuperBlockStateParsed;
      // Schedule the decoding of this superblock if it is allowed.
      if (CanDecode(row_index, column_index)) {
        ++threading_.pending_jobs;
        threading_.sb_state[row_index][column_index] =
            kSuperBlockStateScheduled;
        // Release the lock before scheduling so the worker is not immediately
        // blocked on |threading_.mutex|.
        lock.unlock();
        thread_pool_->Schedule(
            [this, row_index, column_index, block_width4x4]() {
              DecodeSuperBlock(row_index, column_index, block_width4x4);
            });
      }
    }
    std::lock_guard<std::mutex> lock(threading_.mutex);
    if (threading_.abort) break;
  }
  tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));

  // We are done parsing. We can return here since the calling thread will make
  // sure that it waits for all the superblocks to be decoded.
  //
  // Finish using |threading_| before |pending_tiles_->Decrement()| because the
  // Tile object could go out of scope as soon as |pending_tiles_->Decrement()|
  // is called.
  threading_.mutex.lock();
  const bool no_pending_jobs = (--threading_.pending_jobs == 0);
  const bool job_succeeded = !threading_.abort;
  threading_.mutex.unlock();
  if (no_pending_jobs) {
    // We are done parsing and decoding this tile.
    pending_tiles_->Decrement(job_succeeded);
  }
  return job_succeeded;
}
817 
CanDecode(int row_index,int column_index) const818 bool Tile::CanDecode(int row_index, int column_index) const {
819   assert(row_index >= 0);
820   assert(column_index >= 0);
821   // If |threading_.sb_state[row_index][column_index]| is not equal to
822   // kSuperBlockStateParsed, then return false. This is ok because if
823   // |threading_.sb_state[row_index][column_index]| is equal to:
824   //   kSuperBlockStateNone - then the superblock is not yet parsed.
825   //   kSuperBlockStateScheduled - then the superblock is already scheduled for
826   //                               decode.
827   //   kSuperBlockStateDecoded - then the superblock has already been decoded.
828   if (row_index >= superblock_rows_ || column_index >= superblock_columns_ ||
829       threading_.sb_state[row_index][column_index] != kSuperBlockStateParsed) {
830     return false;
831   }
832   // First superblock has no dependencies.
833   if (row_index == 0 && column_index == 0) {
834     return true;
835   }
836   // Superblocks in the first row only depend on the superblock to the left of
837   // it.
838   if (row_index == 0) {
839     return threading_.sb_state[0][column_index - 1] == kSuperBlockStateDecoded;
840   }
841   // All other superblocks depend on superblock to the left of it (if one
842   // exists) and superblock to the top right with a lag of
843   // |intra_block_copy_lag_| (if one exists).
844   const int top_right_column_index =
845       std::min(column_index + intra_block_copy_lag_, superblock_columns_ - 1);
846   return threading_.sb_state[row_index - 1][top_right_column_index] ==
847              kSuperBlockStateDecoded &&
848          (column_index == 0 ||
849           threading_.sb_state[row_index][column_index - 1] ==
850               kSuperBlockStateDecoded);
851 }
852 
// Worker-thread entry point in split parse/decode mode. Decodes the superblock
// at (|row_index|, |column_index|), then schedules any neighboring superblocks
// whose dependencies this decode has just satisfied. On failure,
// |threading_.abort| is set. The job that drops |threading_.pending_jobs| to
// zero reports tile completion through |pending_tiles_|.
void Tile::DecodeSuperBlock(int row_index, int column_index,
                            int block_width4x4) {
  const int row4x4 = row4x4_start_ + (row_index * block_width4x4);
  const int column4x4 = column4x4_start_ + (column_index * block_width4x4);
  std::unique_ptr<TileScratchBuffer> scratch_buffer =
      tile_scratch_buffer_pool_->Get();
  bool ok = scratch_buffer != nullptr;
  if (ok) {
    ok = ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
                           kProcessingModeDecodeOnly);
    tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
  }
  std::unique_lock<std::mutex> lock(threading_.mutex);
  if (ok) {
    threading_.sb_state[row_index][column_index] = kSuperBlockStateDecoded;
    // Candidate rows and columns that we could potentially begin the decoding
    // (if it is allowed to do so). The candidates are:
    //   1) The superblock to the bottom-left of the current superblock with a
    //   lag of |intra_block_copy_lag_| (or the beginning of the next superblock
    //   row in case there are less than |intra_block_copy_lag_| superblock
    //   columns in the Tile).
    //   2) The superblock to the right of the current superblock.
    const int candidate_row_indices[] = {row_index + 1, row_index};
    const int candidate_column_indices[] = {
        std::max(0, column_index - intra_block_copy_lag_), column_index + 1};
    for (size_t i = 0; i < std::extent<decltype(candidate_row_indices)>::value;
         ++i) {
      const int candidate_row_index = candidate_row_indices[i];
      const int candidate_column_index = candidate_column_indices[i];
      if (!CanDecode(candidate_row_index, candidate_column_index)) {
        continue;
      }
      ++threading_.pending_jobs;
      threading_.sb_state[candidate_row_index][candidate_column_index] =
          kSuperBlockStateScheduled;
      // Drop the lock while scheduling so the new job is not immediately
      // blocked on |threading_.mutex|; reacquire it before touching the
      // shared state again.
      lock.unlock();
      thread_pool_->Schedule([this, candidate_row_index, candidate_column_index,
                              block_width4x4]() {
        DecodeSuperBlock(candidate_row_index, candidate_column_index,
                         block_width4x4);
      });
      lock.lock();
    }
  } else {
    threading_.abort = true;
  }
  // Finish using |threading_| before |pending_tiles_->Decrement()| because the
  // Tile object could go out of scope as soon as |pending_tiles_->Decrement()|
  // is called.
  const bool no_pending_jobs = (--threading_.pending_jobs == 0);
  const bool job_succeeded = !threading_.abort;
  lock.unlock();
  if (no_pending_jobs) {
    // We are done parsing and decoding this tile.
    pending_tiles_->Decrement(job_succeeded);
  }
}
910 
// Copies the bottom pixel row of the superblock row starting at |row4x4| into
// |intra_prediction_buffer_| for each plane. Presumably this preserves the
// unfiltered pixels so the next superblock row can use them as its top intra
// reference after the frame buffer is modified — TODO(review): confirm against
// the readers of |intra_prediction_buffer_|.
void Tile::PopulateIntraPredictionBuffer(int row4x4) {
  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
  // Nothing to do if the buffer is unused or this is the tile's last
  // superblock row.
  if (!use_intra_prediction_buffer_ || row4x4 + block_width4x4 >= row4x4_end_) {
    return;
  }
  const size_t pixel_size =
      (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t)
                                                   : sizeof(uint16_t));
  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
    // Last pixel row of this superblock row, in plane coordinates.
    const int row_to_copy =
        (MultiplyBy4(row4x4 + block_width4x4) >> subsampling_y_[plane]) - 1;
    const size_t pixels_to_copy =
        (MultiplyBy4(column4x4_end_ - column4x4_start_) >>
         subsampling_x_[plane]) *
        pixel_size;
    const size_t column_start =
        MultiplyBy4(column4x4_start_) >> subsampling_x_[plane];
    void* start;
#if LIBGAV1_MAX_BITDEPTH >= 10
    if (sequence_header_.color_config.bitdepth > 8) {
      // High bitdepth: view the byte buffer as uint16_t before indexing.
      Array2DView<uint16_t> buffer(
          buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
          reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
      start = &buffer[row_to_copy][column_start];
    } else  // NOLINT
#endif
    {
      start = &buffer_[plane][row_to_copy][column_start];
    }
    memcpy((*intra_prediction_buffer_)[plane].get() + column_start * pixel_size,
           start, pixels_to_copy);
  }
}
944 
// Computes the context index used when decoding the all_zero symbol for a
// transform block, from the per-plane top and left entropy contexts
// (|coefficient_levels_| and, for chroma, |dc_categories_|). (x4, y4) is the
// transform block position and (w4, h4) its size, all in 4x4 units within the
// plane.
int Tile::GetTransformAllZeroContext(const Block& block, Plane plane,
                                     TransformSize tx_size, int x4, int y4,
                                     int w4, int h4) {
  const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
  const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];

  const int tx_width = kTransformWidth[tx_size];
  const int tx_height = kTransformHeight[tx_size];
  const BlockSize plane_size = block.residual_size[plane];
  const int block_width = kBlockWidthPixels[plane_size];
  const int block_height = kBlockHeightPixels[plane_size];

  int top = 0;
  int left = 0;
  // Clamp the context runs to the frame boundary.
  const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
  const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
  if (plane == kPlaneY) {
    // A luma block fully covered by a single transform always uses context 0.
    if (block_width == tx_width && block_height == tx_height) return 0;
    const uint8_t* coefficient_levels =
        &coefficient_levels_[kEntropyContextTop][plane][x4];
    for (int i = 0; i < num_top_elements; ++i) {
      top = std::max(top, static_cast<int>(coefficient_levels[i]));
    }
    coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
    for (int i = 0; i < num_left_elements; ++i) {
      left = std::max(left, static_cast<int>(coefficient_levels[i]));
    }
    assert(top <= 4);
    assert(left <= 4);
    // kAllZeroContextsByTopLeft is pre-computed based on the logic in the spec
    // for top and left.
    return kAllZeroContextsByTopLeft[top][left];
  }
  // Chroma planes: only whether any top/left context is nonzero matters, so
  // the values are OR-ed rather than maximized.
  const uint8_t* coefficient_levels =
      &coefficient_levels_[kEntropyContextTop][plane][x4];
  const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
  for (int i = 0; i < num_top_elements; ++i) {
    top |= coefficient_levels[i];
    top |= dc_categories[i];
  }
  coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
  dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
  for (int i = 0; i < num_left_elements; ++i) {
    left |= coefficient_levels[i];
    left |= dc_categories[i];
  }
  return static_cast<int>(top != 0) + static_cast<int>(left != 0) + 7 +
         3 * static_cast<int>(block_width * block_height >
                              tx_width * tx_height);
}
995 
GetTransformSet(TransformSize tx_size,bool is_inter) const996 TransformSet Tile::GetTransformSet(TransformSize tx_size, bool is_inter) const {
997   const TransformSize tx_size_square_min = kTransformSizeSquareMin[tx_size];
998   const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
999   if (tx_size_square_max == kTransformSize64x64) return kTransformSetDctOnly;
1000   if (is_inter) {
1001     if (frame_header_.reduced_tx_set ||
1002         tx_size_square_max == kTransformSize32x32) {
1003       return kTransformSetInter3;
1004     }
1005     if (tx_size_square_min == kTransformSize16x16) return kTransformSetInter2;
1006     return kTransformSetInter1;
1007   }
1008   if (tx_size_square_max == kTransformSize32x32) return kTransformSetDctOnly;
1009   if (frame_header_.reduced_tx_set ||
1010       tx_size_square_min == kTransformSize16x16) {
1011     return kTransformSetIntra2;
1012   }
1013   return kTransformSetIntra1;
1014 }
1015 
// Returns the transform type for the transform block at (block_x, block_y) in
// |plane|. Luma looks up |transform_types_| directly (populated by
// ReadTransformType()); chroma derives its type from the co-located luma block
// (inter) or from the UV prediction mode (intra), falling back to DCT_DCT when
// the derived type is not in the allowed set.
TransformType Tile::ComputeTransformType(const Block& block, Plane plane,
                                         TransformSize tx_size, int block_x,
                                         int block_y) {
  const BlockParameters& bp = *block.bp;
  const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
  // Lossless segments and 64x64 transforms always use DCT_DCT.
  if (frame_header_.segmentation
          .lossless[bp.prediction_parameters->segment_id] ||
      tx_size_square_max == kTransformSize64x64) {
    return kTransformTypeDctDct;
  }
  if (plane == kPlaneY) {
    return transform_types_[block_y - block.row4x4][block_x - block.column4x4];
  }
  const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
  TransformType tx_type;
  if (bp.is_inter) {
    // Map the chroma position back to the co-located luma 4x4 position,
    // clamped to the block's top-left corner.
    const int x4 =
        std::max(block.column4x4, block_x << subsampling_x_[kPlaneU]);
    const int y4 = std::max(block.row4x4, block_y << subsampling_y_[kPlaneU]);
    tx_type = transform_types_[y4 - block.row4x4][x4 - block.column4x4];
  } else {
    tx_type = kModeToTransformType[bp.prediction_parameters->uv_mode];
  }
  return kTransformTypeInSetMask[tx_set].Contains(tx_type)
             ? tx_type
             : kTransformTypeDctDct;
}
1043 
// Reads the transform type symbol for the luma transform block at (x4, y4)
// from the bitstream (when the transform set and qindex allow a non-DCT type)
// and records it for the whole transform block area via SetTransformType().
void Tile::ReadTransformType(const Block& block, int x4, int y4,
                             TransformSize tx_size) {
  BlockParameters& bp = *block.bp;
  const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);

  TransformType tx_type = kTransformTypeDctDct;
  // The symbol is only present when more than DCT_DCT is allowed and the
  // segment's qindex is nonzero.
  if (tx_set != kTransformSetDctOnly &&
      frame_header_.segmentation.qindex[bp.prediction_parameters->segment_id] >
          0) {
    const int cdf_index = SymbolDecoderContext::TxTypeIndex(tx_set);
    const int cdf_tx_size_index =
        TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[tx_size]);
    uint16_t* cdf;
    if (bp.is_inter) {
      cdf = symbol_decoder_context_
                .inter_tx_type_cdf[cdf_index][cdf_tx_size_index];
      // The symbol count depends on the transform set.
      switch (tx_set) {
        case kTransformSetInter1:
          tx_type = static_cast<TransformType>(reader_.ReadSymbol<16>(cdf));
          break;
        case kTransformSetInter2:
          tx_type = static_cast<TransformType>(reader_.ReadSymbol<12>(cdf));
          break;
        default:
          assert(tx_set == kTransformSetInter3);
          tx_type = static_cast<TransformType>(reader_.ReadSymbol(cdf));
          break;
      }
    } else {
      // Backup the current set of warnings and disable -Warray-bounds for this
      // block as the compiler cannot, in all cases, determine whether
      // |intra_mode| is within [0, kIntraPredictionModesY).
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif
      const PredictionMode intra_mode =
          block.bp->prediction_parameters->use_filter_intra
              ? kFilterIntraModeToIntraPredictor[block.bp->prediction_parameters
                                                     ->filter_intra_mode]
              : bp.y_mode;
      assert(intra_mode < kIntraPredictionModesY);
      cdf = symbol_decoder_context_
                .intra_tx_type_cdf[cdf_index][cdf_tx_size_index][intra_mode];
      assert(tx_set == kTransformSetIntra1 || tx_set == kTransformSetIntra2);
      tx_type = static_cast<TransformType>((tx_set == kTransformSetIntra1)
                                               ? reader_.ReadSymbol<7>(cdf)
                                               : reader_.ReadSymbol<5>(cdf));
      // Restore the previous set of compiler warnings.
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
    }

    // This array does not contain an entry for kTransformSetDctOnly, so the
    // first dimension needs to be offset by 1.
    tx_type = kInverseTransformTypeBySet[tx_set - 1][tx_type];
  }
  SetTransformType(block, x4, y4, kTransformWidth4x4[tx_size],
                   kTransformHeight4x4[tx_size], tx_type, transform_types_);
}
1105 
1106 // Section 8.3.2 in the spec, under coeff_base and coeff_br.
1107 // Bottom boundary checks are avoided by the padded rows.
1108 // For a coefficient near the right boundary, the two right neighbors and the
1109 // one bottom-right neighbor may be out of boundary. We don't check the right
1110 // boundary for them, because the out of boundary neighbors project to positions
1111 // above the diagonal line which goes through the current coefficient and these
1112 // positions are still all 0s according to the diagonal scan order.
// Decodes the base level (coeff_base) and base range (coeff_br) of each
// quantized coefficient for 2D (diagonal-scan) transforms, walking the scan in
// reverse from eob-2 down to position 0. |level_buffer| holds levels clipped
// for neighbor-context computation; |quantized_buffer| holds the unclipped
// running levels.
template <typename ResidualType>
void Tile::ReadCoeffBase2D(
    const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
    int eob,
    uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
    uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
                                 [kCoeffBaseRangeSymbolCount + 1],
    ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
  const int tx_width = 1 << adjusted_tx_width_log2;
  for (int i = eob - 2; i >= 1; --i) {
    const uint16_t pos = scan[i];
    const int row = pos >> adjusted_tx_width_log2;
    const int column = pos & (tx_width - 1);
    auto* const quantized = &quantized_buffer[pos];
    auto* const levels = &level_buffer[pos];
    // Neighbors at offsets {0,1}, {1,0}, {1,1}, {0,2} and {2,0} relative to
    // |pos|; bottom accesses are safe due to the padded rows.
    const int neighbor_sum = 1 + levels[1] + levels[tx_width] +
                             levels[tx_width + 1] + levels[2] +
                             levels[MultiplyBy2(tx_width)];
    const int context =
        ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
        kCoeffBaseContextOffset[tx_size][std::min(row, 4)][std::min(column, 4)];
    int level =
        reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
    levels[0] = level;
    if (level > kNumQuantizerBaseLevels) {
      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
      // + 1, because we clip the overall output to 6 and the unclipped
      // quantized values will always result in an output of greater than 6.
      int context = std::min(6, DivideBy2(1 + quantized[1] +          // {0, 1}
                                          quantized[tx_width] +       // {1, 0}
                                          quantized[tx_width + 1]));  // {1, 1}
      context += 14 >> static_cast<int>((row | column) < 2);
      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
    }
    quantized[0] = level;
  }
  // Read position 0.
  {
    auto* const quantized = &quantized_buffer[0];
    int level = reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[0]);
    level_buffer[0] = level;
    if (level > kNumQuantizerBaseLevels) {
      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
      // + 1, because we clip the overall output to 6 and the unclipped
      // quantized values will always result in an output of greater than 6.
      const int context =
          std::min(6, DivideBy2(1 + quantized[1] +          // {0, 1}
                                quantized[tx_width] +       // {1, 0}
                                quantized[tx_width + 1]));  // {1, 1}
      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
    }
    quantized[0] = level;
  }
}
1167 
1168 // Section 8.3.2 in the spec, under coeff_base and coeff_br.
1169 // Bottom boundary checks are avoided by the padded rows.
1170 // For a coefficient near the right boundary, the four right neighbors may be
1171 // out of boundary. We don't do the boundary check for the first three right
1172 // neighbors, because even for the transform blocks with smallest width 4, the
1173 // first three out of boundary neighbors project to positions left of the
1174 // current coefficient and these positions are still all 0s according to the
1175 // column scan order. However, when transform block width is 4 and the current
1176 // coefficient is on the right boundary, its fourth right neighbor projects to
1177 // the under position on the same column, which could be nonzero. Therefore, we
1178 // must skip the fourth right neighbor. To make it simple, for any coefficient,
1179 // we always do the boundary check for its fourth right neighbor.
// Decodes coeff_base/coeff_br for horizontal (row-scan) transforms, walking
// the scan in reverse from eob-2 down to 0. See the comment block above for
// why only the fourth right neighbor needs an explicit boundary check.
template <typename ResidualType>
void Tile::ReadCoeffBaseHorizontal(
    const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
    int eob,
    uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
    uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
                                 [kCoeffBaseRangeSymbolCount + 1],
    ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
  const int tx_width = 1 << adjusted_tx_width_log2;
  int i = eob - 2;
  do {
    const uint16_t pos = scan[i];
    const int column = pos & (tx_width - 1);
    auto* const quantized = &quantized_buffer[pos];
    auto* const levels = &level_buffer[pos];
    const int neighbor_sum =
        1 + (levels[1] +                                  // {0, 1}
             levels[tx_width] +                           // {1, 0}
             levels[2] +                                  // {0, 2}
             levels[3] +                                  // {0, 3}
             ((column + 4 < tx_width) ? levels[4] : 0));  // {0, 4}
    const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
                        kCoeffBasePositionContextOffset[column];
    int level =
        reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
    levels[0] = level;
    if (level > kNumQuantizerBaseLevels) {
      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
      // + 1, because we clip the overall output to 6 and the unclipped
      // quantized values will always result in an output of greater than 6.
      int context = std::min(6, DivideBy2(1 + quantized[1] +     // {0, 1}
                                          quantized[tx_width] +  // {1, 0}
                                          quantized[2]));        // {0, 2}
      if (pos != 0) {
        context += 14 >> static_cast<int>(column == 0);
      }
      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
    }
    quantized[0] = level;
  } while (--i >= 0);
}
1221 
// Section 8.3.2 in the spec, under coeff_base and coeff_br.
// Variant used when the transform class is vertical: contexts are derived
// from the neighbors below the current position (same column).
// Bottom boundary checks are avoided by the padded rows.
// Right boundary check is performed explicitly.
template <typename ResidualType>
void Tile::ReadCoeffBaseVertical(
    const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
    int eob,
    uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
    uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
                                 [kCoeffBaseRangeSymbolCount + 1],
    ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
  const int tx_width = 1 << adjusted_tx_width_log2;
  // Coefficients are parsed in reverse scan order. The coefficient at
  // scan[eob - 1] has already been read by the caller, so start at eob - 2.
  int i = eob - 2;
  do {
    const uint16_t pos = scan[i];
    const int row = pos >> adjusted_tx_width_log2;
    const int column = pos & (tx_width - 1);
    auto* const quantized = &quantized_buffer[pos];
    auto* const levels = &level_buffer[pos];
    // Sum the {0, 1} neighbor and the four neighbors directly below. Only
    // the right edge needs an explicit check; rows below are zero padding.
    const int neighbor_sum =
        1 + (((column + 1 < tx_width) ? levels[1] : 0) +  // {0, 1}
             levels[tx_width] +                           // {1, 0}
             levels[MultiplyBy2(tx_width)] +              // {2, 0}
             levels[tx_width * 3] +                       // {3, 0}
             levels[MultiplyBy4(tx_width)]);              // {4, 0}
    const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
                        kCoeffBasePositionContextOffset[row];
    int level =
        reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
    levels[0] = level;
    if (level > kNumQuantizerBaseLevels) {
      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
      // + 1, because we clip the overall output to 6 and the unclipped
      // quantized values will always result in an output of greater than 6.
      const int quantized_column1 = (column + 1 < tx_width) ? quantized[1] : 0;
      int context =
          std::min(6, DivideBy2(1 + quantized_column1 +              // {0, 1}
                                quantized[tx_width] +                // {1, 0}
                                quantized[MultiplyBy2(tx_width)]));  // {2, 0}
      if (pos != 0) {
        context += 14 >> static_cast<int>(row == 0);
      }
      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
    }
    quantized[0] = level;
  } while (--i >= 0);
}
1269 
GetDcSignContext(int x4,int y4,int w4,int h4,Plane plane)1270 int Tile::GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane) {
1271   const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
1272   const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
1273   // Set dc_sign to 8-bit long so that std::accumulate() saves sign extension.
1274   int8_t dc_sign = std::accumulate(
1275       dc_categories, dc_categories + GetNumElements(w4, x4, max_x4x4), 0);
1276   const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
1277   dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
1278   dc_sign = std::accumulate(
1279       dc_categories, dc_categories + GetNumElements(h4, y4, max_y4x4), dc_sign);
1280   // This return statement is equivalent to:
1281   //   if (dc_sign < 0) return 1;
1282   //   if (dc_sign > 0) return 2;
1283   //   return 0;
1284   // And it is better than:
1285   //   return static_cast<int>(dc_sign != 0) + static_cast<int>(dc_sign > 0);
1286   return static_cast<int>(dc_sign < 0) +
1287          MultiplyBy2(static_cast<int>(dc_sign > 0));
1288 }
1289 
SetEntropyContexts(int x4,int y4,int w4,int h4,Plane plane,uint8_t coefficient_level,int8_t dc_category)1290 void Tile::SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
1291                               uint8_t coefficient_level, int8_t dc_category) {
1292   const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
1293   const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
1294   memset(&coefficient_levels_[kEntropyContextTop][plane][x4], coefficient_level,
1295          num_top_elements);
1296   memset(&dc_categories_[kEntropyContextTop][plane][x4], dc_category,
1297          num_top_elements);
1298   const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
1299   const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
1300   memset(&coefficient_levels_[kEntropyContextLeft][plane][y4],
1301          coefficient_level, num_left_elements);
1302   memset(&dc_categories_[kEntropyContextLeft][plane][y4], dc_category,
1303          num_left_elements);
1304 }
1305 
// Reads the sign of a nonzero quantized coefficient (plus an Exp-Golomb
// suffix for values that saturated the base range) and replaces
// residual_buffer[pos] with the signed, clipped, dequantized value.
// Returns false only when the bitstream contains an invalid Exp-Golomb
// length. Step 1 of section 7.12.3 in the spec.
template <typename ResidualType, bool is_dc_coefficient>
bool Tile::ReadSignAndApplyDequantization(
    const uint16_t* const scan, int i, int q_value,
    const uint8_t* const quantizer_matrix, int shift, int max_value,
    uint16_t* const dc_sign_cdf, int8_t* const dc_category,
    int* const coefficient_level, ResidualType* residual_buffer) {
  const int pos = is_dc_coefficient ? 0 : scan[i];
  // If residual_buffer[pos] is zero, then the rest of the function has no
  // effect.
  int level = residual_buffer[pos];
  if (level == 0) return true;
  // The DC sign is context coded; all other signs are raw bits.
  const int sign = is_dc_coefficient
                       ? static_cast<int>(reader_.ReadSymbol(dc_sign_cdf))
                       : reader_.ReadBit();
  if (level > kNumQuantizerBaseLevels + kQuantizerCoefficientBaseRange) {
    // The level saturated the base range; read the remainder as an
    // Exp-Golomb code: unary length prefix followed by length - 1 raw bits.
    int length = 0;
    bool golomb_length_bit = false;
    do {
      golomb_length_bit = reader_.ReadBit() != 0;
      ++length;
      if (length > 20) {
        LIBGAV1_DLOG(ERROR, "Invalid golomb_length %d", length);
        return false;
      }
    } while (!golomb_length_bit);
    int x = 1;
    // NOTE: this loop's |i| intentionally shadows the |i| parameter, which
    // is no longer needed at this point.
    for (int i = length - 2; i >= 0; --i) {
      x = (x << 1) | reader_.ReadBit();
    }
    level += x - 1;
  }
  if (is_dc_coefficient) {
    // Record the DC sign so neighboring blocks can derive their DC sign
    // context later.
    *dc_category = (sign != 0) ? -1 : 1;
  }
  level &= 0xfffff;
  *coefficient_level += level;
  // Apply dequantization. Step 1 of section 7.12.3 in the spec.
  int q = q_value;
  if (quantizer_matrix != nullptr) {
    q = RightShiftWithRounding(q * quantizer_matrix[pos], 5);
  }
  // The intermediate multiplication can exceed 32 bits, so it has to be
  // performed by promoting one of the values to int64_t.
  int32_t dequantized_value = (static_cast<int64_t>(q) * level) & 0xffffff;
  dequantized_value >>= shift;
  // At this point:
  //   * |dequantized_value| is always non-negative.
  //   * |sign| can be either 0 or 1.
  //   * min_value = -(max_value + 1).
  // We need to apply the following:
  // dequantized_value = sign ? -dequantized_value : dequantized_value;
  // dequantized_value = Clip3(dequantized_value, min_value, max_value);
  //
  // Note that -x == ~(x - 1).
  //
  // Now, The above two lines can be done with a std::min and xor as follows:
  dequantized_value = std::min(dequantized_value - sign, max_value) ^ -sign;
  residual_buffer[pos] = dequantized_value;
  return true;
}
1366 
ReadCoeffBaseRange(uint16_t * cdf)1367 int Tile::ReadCoeffBaseRange(uint16_t* cdf) {
1368   int level = 0;
1369   for (int j = 0; j < kCoeffBaseRangeMaxIterations; ++j) {
1370     const int coeff_base_range =
1371         reader_.ReadSymbol<kCoeffBaseRangeSymbolCount>(cdf);
1372     level += coeff_base_range;
1373     if (coeff_base_range < (kCoeffBaseRangeSymbolCount - 1)) break;
1374   }
1375   return level;
1376 }
1377 
// Reads all the quantized coefficients of one transform block, dequantizes
// them into *block.residual, and sets *tx_type. Returns the end-of-block
// position (number of coefficients parsed), 0 if the block is coded as all
// zero, or -1 on parse failure.
template <typename ResidualType>
int Tile::ReadTransformCoefficients(const Block& block, Plane plane,
                                    int start_x, int start_y,
                                    TransformSize tx_size,
                                    TransformType* const tx_type) {
  const int x4 = DivideBy4(start_x);
  const int y4 = DivideBy4(start_y);
  const int w4 = kTransformWidth4x4[tx_size];
  const int h4 = kTransformHeight4x4[tx_size];
  const int tx_size_context = kTransformSizeContext[tx_size];
  int context =
      GetTransformAllZeroContext(block, plane, tx_size, x4, y4, w4, h4);
  const bool all_zero = reader_.ReadSymbol(
      symbol_decoder_context_.all_zero_cdf[tx_size_context][context]);
  if (all_zero) {
    if (plane == kPlaneY) {
      SetTransformType(block, x4, y4, w4, h4, kTransformTypeDctDct,
                       transform_types_);
    }
    SetEntropyContexts(x4, y4, w4, h4, plane, 0, 0);
    // This is not used in this case, so it can be set to any value.
    *tx_type = kNumTransformTypes;
    return 0;
  }
  const int tx_width = kTransformWidth[tx_size];
  const int tx_height = kTransformHeight[tx_size];
  const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
  const int adjusted_tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
  const int tx_padding =
      (1 << adjusted_tx_width_log2) * kResidualPaddingVertical;
  auto* residual = reinterpret_cast<ResidualType*>(*block.residual);
  // Clear padding to avoid bottom boundary checks when parsing quantized
  // coefficients.
  memset(residual, 0, (tx_width * tx_height + tx_padding) * residual_size_);
  // Parallel 8-bit copy of the levels, used only for context derivation.
  uint8_t level_buffer[(32 + kResidualPaddingVertical) * 32];
  memset(
      level_buffer, 0,
      kTransformWidth[adjusted_tx_size] * kTransformHeight[adjusted_tx_size] +
          tx_padding);
  const int clamped_tx_height = std::min(tx_height, 32);
  if (plane == kPlaneY) {
    ReadTransformType(block, x4, y4, tx_size);
  }
  BlockParameters& bp = *block.bp;
  *tx_type = ComputeTransformType(block, plane, tx_size, x4, y4);
  const int eob_multi_size = kEobMultiSizeLookup[tx_size];
  const PlaneType plane_type = GetPlaneType(plane);
  const TransformClass tx_class = GetTransformClass(*tx_type);
  context = static_cast<int>(tx_class != kTransformClass2D);
  // Read the end-of-block (eob) position class. The symbol alphabet depends
  // on the transform area; the two largest sizes do not use |context|.
  int eob_pt = 1;
  switch (eob_multi_size) {
    case 0:
      eob_pt += reader_.ReadSymbol<kEobPt16SymbolCount>(
          symbol_decoder_context_.eob_pt_16_cdf[plane_type][context]);
      break;
    case 1:
      eob_pt += reader_.ReadSymbol<kEobPt32SymbolCount>(
          symbol_decoder_context_.eob_pt_32_cdf[plane_type][context]);
      break;
    case 2:
      eob_pt += reader_.ReadSymbol<kEobPt64SymbolCount>(
          symbol_decoder_context_.eob_pt_64_cdf[plane_type][context]);
      break;
    case 3:
      eob_pt += reader_.ReadSymbol<kEobPt128SymbolCount>(
          symbol_decoder_context_.eob_pt_128_cdf[plane_type][context]);
      break;
    case 4:
      eob_pt += reader_.ReadSymbol<kEobPt256SymbolCount>(
          symbol_decoder_context_.eob_pt_256_cdf[plane_type][context]);
      break;
    case 5:
      eob_pt += reader_.ReadSymbol<kEobPt512SymbolCount>(
          symbol_decoder_context_.eob_pt_512_cdf[plane_type]);
      break;
    case 6:
    default:
      eob_pt += reader_.ReadSymbol<kEobPt1024SymbolCount>(
          symbol_decoder_context_.eob_pt_1024_cdf[plane_type]);
      break;
  }
  // Refine the eob within its class: one adaptively coded msb followed by
  // literal lsbs.
  int eob = (eob_pt < 2) ? eob_pt : ((1 << (eob_pt - 2)) + 1);
  if (eob_pt >= 3) {
    context = eob_pt - 3;
    const bool eob_extra = reader_.ReadSymbol(
        symbol_decoder_context_
            .eob_extra_cdf[tx_size_context][plane_type][context]);
    if (eob_extra) eob += 1 << (eob_pt - 3);
    for (int i = 1; i < eob_pt - 2; ++i) {
      assert(eob_pt - i >= 3);
      assert(eob_pt <= kEobPt1024SymbolCount);
      if (reader_.ReadBit() != 0) {
        eob += 1 << (eob_pt - i - 3);
      }
    }
  }
  const uint16_t* scan = kScan[tx_class][tx_size];
  const int clamped_tx_size_context = std::min(tx_size_context, 3);
  auto coeff_base_range_cdf =
      symbol_decoder_context_
          .coeff_base_range_cdf[clamped_tx_size_context][plane_type];
  // Read the last coefficient.
  {
    context = GetCoeffBaseContextEob(tx_size, eob - 1);
    const uint16_t pos = scan[eob - 1];
    int level =
        1 + reader_.ReadSymbol<kCoeffBaseEobSymbolCount>(
                symbol_decoder_context_
                    .coeff_base_eob_cdf[tx_size_context][plane_type][context]);
    level_buffer[pos] = level;
    if (level > kNumQuantizerBaseLevels) {
      level +=
          ReadCoeffBaseRange(coeff_base_range_cdf[GetCoeffBaseRangeContextEob(
              adjusted_tx_width_log2, pos, tx_class)]);
    }
    residual[pos] = level;
  }
  if (eob > 1) {
    // Read all the other coefficients.
    // Lookup used to call the right variant of ReadCoeffBase*() based on the
    // transform class.
    static constexpr void (Tile::*kGetCoeffBaseFunc[])(
        const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
        int eob,
        uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
        uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
                                     [kCoeffBaseRangeSymbolCount + 1],
        ResidualType* quantized_buffer,
        uint8_t* level_buffer) = {&Tile::ReadCoeffBase2D<ResidualType>,
                                  &Tile::ReadCoeffBaseHorizontal<ResidualType>,
                                  &Tile::ReadCoeffBaseVertical<ResidualType>};
    (this->*kGetCoeffBaseFunc[tx_class])(
        scan, tx_size, adjusted_tx_width_log2, eob,
        symbol_decoder_context_.coeff_base_cdf[tx_size_context][plane_type],
        coeff_base_range_cdf, residual, level_buffer);
  }
  // Sign decoding and dequantization.
  const int max_value = (1 << (7 + sequence_header_.color_config.bitdepth)) - 1;
  const int current_quantizer_index =
      GetQIndex(frame_header_.segmentation,
                bp.prediction_parameters->segment_id, current_quantizer_index_);
  const int dc_q_value = quantizer_.GetDcValue(plane, current_quantizer_index);
  const int ac_q_value = quantizer_.GetAcValue(plane, current_quantizer_index);
  const int shift = kQuantizationShift[tx_size];
  const uint8_t* const quantizer_matrix =
      (frame_header_.quantizer.use_matrix &&
       *tx_type < kTransformTypeIdentityIdentity &&
       !frame_header_.segmentation
            .lossless[bp.prediction_parameters->segment_id] &&
       frame_header_.quantizer.matrix_level[plane] < 15)
          ? quantizer_matrix_[frame_header_.quantizer.matrix_level[plane]]
                             [plane_type][adjusted_tx_size]
                                 .get()
          : nullptr;
  int coefficient_level = 0;
  int8_t dc_category = 0;
  // The DC sign CDF is only needed when the DC coefficient is nonzero.
  uint16_t* const dc_sign_cdf =
      (residual[0] != 0)
          ? symbol_decoder_context_.dc_sign_cdf[plane_type][GetDcSignContext(
                x4, y4, w4, h4, plane)]
          : nullptr;
  assert(scan[0] == 0);
  if (!ReadSignAndApplyDequantization<ResidualType, /*is_dc_coefficient=*/true>(
          scan, 0, dc_q_value, quantizer_matrix, shift, max_value, dc_sign_cdf,
          &dc_category, &coefficient_level, residual)) {
    return -1;
  }
  if (eob > 1) {
    int i = 1;
    do {
      if (!ReadSignAndApplyDequantization<ResidualType,
                                          /*is_dc_coefficient=*/false>(
              scan, i, ac_q_value, quantizer_matrix, shift, max_value, nullptr,
              nullptr, &coefficient_level, residual)) {
        return -1;
      }
    } while (++i < eob);
    MoveCoefficientsForTxWidth64(clamped_tx_height, tx_width, residual);
  }
  SetEntropyContexts(x4, y4, w4, h4, plane, std::min(4, coefficient_level),
                     dc_category);
  if (split_parse_and_decode_) {
    *block.residual += tx_width * tx_height * residual_size_;
  }
  return eob;
}
1563 
// CALL_BITDEPTH_FUNCTION is a macro that calls the appropriate template
// |function| depending on the value of |sequence_header_.color_config.bitdepth|
// with the variadic arguments.
// The do { ... } while (false) wrapper makes each expansion a single
// statement so the macro composes safely with if/else.
#if LIBGAV1_MAX_BITDEPTH >= 10
#define CALL_BITDEPTH_FUNCTION(function, ...)         \
  do {                                                \
    if (sequence_header_.color_config.bitdepth > 8) { \
      function<uint16_t>(__VA_ARGS__);                \
    } else {                                          \
      function<uint8_t>(__VA_ARGS__);                 \
    }                                                 \
  } while (false)
#else
// 8-bit only build: always instantiate the uint8_t variant.
#define CALL_BITDEPTH_FUNCTION(function, ...) \
  do {                                        \
    function<uint8_t>(__VA_ARGS__);           \
  } while (false)
#endif
1582 
// Processes a single transform block of |plane|: intra/palette prediction
// (when decoding a non-inter block), coefficient parsing and/or
// reconstruction depending on |mode|. (base_x, base_y) is the plane-space
// origin and (x, y) the 4x4-unit offset of this transform block within it.
// Returns false on parse failure.
bool Tile::TransformBlock(const Block& block, Plane plane, int base_x,
                          int base_y, TransformSize tx_size, int x, int y,
                          ProcessingMode mode) {
  BlockParameters& bp = *block.bp;
  const int subsampling_x = subsampling_x_[plane];
  const int subsampling_y = subsampling_y_[plane];
  const int start_x = base_x + MultiplyBy4(x);
  const int start_y = base_y + MultiplyBy4(y);
  const int max_x = MultiplyBy4(frame_header_.columns4x4) >> subsampling_x;
  const int max_y = MultiplyBy4(frame_header_.rows4x4) >> subsampling_y;
  // Blocks that start beyond the visible frame area carry no data.
  if (start_x >= max_x || start_y >= max_y) return true;
  const int row = DivideBy4(start_y << subsampling_y);
  const int column = DivideBy4(start_x << subsampling_x);
  // Position of this transform block within its superblock (in 4x4 units).
  const int mask = sequence_header_.use_128x128_superblock ? 31 : 15;
  const int sub_block_row4x4 = row & mask;
  const int sub_block_column4x4 = column & mask;
  const int step_x = kTransformWidth4x4[tx_size];
  const int step_y = kTransformHeight4x4[tx_size];
  const bool do_decode = mode == kProcessingModeDecodeOnly ||
                         mode == kProcessingModeParseAndDecode;
  if (do_decode && !bp.is_inter) {
    if (bp.prediction_parameters->palette_mode_info.size[GetPlaneType(plane)] >
        0) {
      CALL_BITDEPTH_FUNCTION(PalettePrediction, block, plane, start_x, start_y,
                             x, y, tx_size);
    } else {
      // Chroma-from-luma blocks predict DC first; the CfL correction is
      // applied below.
      const PredictionMode mode =
          (plane == kPlaneY) ? bp.y_mode
                             : (bp.prediction_parameters->uv_mode ==
                                        kPredictionModeChromaFromLuma
                                    ? kPredictionModeDc
                                    : bp.prediction_parameters->uv_mode);
      // Top-right and bottom-left availability positions in the
      // block_decoded bitmap.
      const int tr_row4x4 = (sub_block_row4x4 >> subsampling_y);
      const int tr_column4x4 =
          (sub_block_column4x4 >> subsampling_x) + step_x + 1;
      const int bl_row4x4 = (sub_block_row4x4 >> subsampling_y) + step_y + 1;
      const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x);
      const bool has_left = x > 0 || block.left_available[plane];
      const bool has_top = y > 0 || block.top_available[plane];

      CALL_BITDEPTH_FUNCTION(
          IntraPrediction, block, plane, start_x, start_y, has_left, has_top,
          block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
          block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
          mode, tx_size);
      if (plane != kPlaneY &&
          bp.prediction_parameters->uv_mode == kPredictionModeChromaFromLuma) {
        CALL_BITDEPTH_FUNCTION(ChromaFromLumaPrediction, block, plane, start_x,
                               start_y, tx_size);
      }
    }
    if (plane == kPlaneY) {
      // Remember the luma extent for later CfL use and invalidate the
      // cached luma buffer.
      block.bp->prediction_parameters->max_luma_width =
          start_x + MultiplyBy4(step_x);
      block.bp->prediction_parameters->max_luma_height =
          start_y + MultiplyBy4(step_y);
      block.scratch_buffer->cfl_luma_buffer_valid = false;
    }
  }
  if (!bp.skip) {
    const int sb_row_index = SuperBlockRowIndex(block.row4x4);
    const int sb_column_index = SuperBlockColumnIndex(block.column4x4);
    if (mode == kProcessingModeDecodeOnly) {
      // Parsing already happened; pop the stored transform parameters and
      // reconstruct.
      Queue<TransformParameters>& tx_params =
          *residual_buffer_threaded_[sb_row_index][sb_column_index]
               ->transform_parameters();
      ReconstructBlock(block, plane, start_x, start_y, tx_size,
                       tx_params.Front().type,
                       tx_params.Front().non_zero_coeff_count);
      tx_params.Pop();
    } else {
      TransformType tx_type;
      int non_zero_coeff_count;
#if LIBGAV1_MAX_BITDEPTH >= 10
      if (sequence_header_.color_config.bitdepth > 8) {
        non_zero_coeff_count = ReadTransformCoefficients<int32_t>(
            block, plane, start_x, start_y, tx_size, &tx_type);
      } else  // NOLINT
#endif
      {
        non_zero_coeff_count = ReadTransformCoefficients<int16_t>(
            block, plane, start_x, start_y, tx_size, &tx_type);
      }
      if (non_zero_coeff_count < 0) return false;
      if (mode == kProcessingModeParseAndDecode) {
        ReconstructBlock(block, plane, start_x, start_y, tx_size, tx_type,
                         non_zero_coeff_count);
      } else {
        assert(mode == kProcessingModeParseOnly);
        // Store the parameters so the decode pass can reconstruct later.
        residual_buffer_threaded_[sb_row_index][sb_column_index]
            ->transform_parameters()
            ->Push(TransformParameters(tx_type, non_zero_coeff_count));
      }
    }
  }
  if (do_decode) {
    // Mark this transform block's 4x4 units as decoded for future intra
    // prediction availability checks.
    bool* block_decoded =
        &block.scratch_buffer
             ->block_decoded[plane][(sub_block_row4x4 >> subsampling_y) + 1]
                            [(sub_block_column4x4 >> subsampling_x) + 1];
    SetBlockValues<bool>(step_y, step_x, true, block_decoded,
                         TileScratchBuffer::kBlockDecodedStride);
  }
  return true;
}
1688 
// Walks the transform tree of an inter block's luma plane iteratively with
// an explicit stack, splitting nodes until each one fits the signaled
// inter transform size, and invokes TransformBlock() on every leaf.
// Returns false on parse failure.
bool Tile::TransformTree(const Block& block, int start_x, int start_y,
                         BlockSize plane_size, ProcessingMode mode) {
  assert(plane_size <= kBlock64x64);
  // Branching factor is 4; Maximum Depth is 4; So the maximum stack size
  // required is (4 - 1) * 4 + 1 = 13.
  Stack<TransformTreeNode, 13> stack;
  // It is okay to cast BlockSize to TransformSize here since the enum are
  // equivalent for all BlockSize values <= kBlock64x64.
  stack.Push(TransformTreeNode(start_x, start_y,
                               static_cast<TransformSize>(plane_size)));

  do {
    TransformTreeNode node = stack.Pop();
    const int row = DivideBy4(node.y);
    const int column = DivideBy4(node.x);
    // Nodes outside the frame carry no data.
    if (row >= frame_header_.rows4x4 || column >= frame_header_.columns4x4) {
      continue;
    }
    const TransformSize inter_tx_size = inter_transform_sizes_[row][column];
    const int width = kTransformWidth[node.tx_size];
    const int height = kTransformHeight[node.tx_size];
    // Leaf: the node fits within the signaled transform size.
    if (width <= kTransformWidth[inter_tx_size] &&
        height <= kTransformHeight[inter_tx_size]) {
      if (!TransformBlock(block, kPlaneY, node.x, node.y, node.tx_size, 0, 0,
                          mode)) {
        return false;
      }
      continue;
    }
    // The split transform size look up gives the right transform size that we
    // should push in the stack.
    //   if (width > height) => transform size whose width is half.
    //   if (width < height) => transform size whose height is half.
    //   if (width == height) => transform size whose width and height are half.
    // Children are pushed in reverse order so the stack (LIFO) pops them in
    // raster order (top-left first).
    const TransformSize split_tx_size = kSplitTransformSize[node.tx_size];
    const int half_width = DivideBy2(width);
    if (width > height) {
      stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
      stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
      continue;
    }
    const int half_height = DivideBy2(height);
    if (width < height) {
      stack.Push(
          TransformTreeNode(node.x, node.y + half_height, split_tx_size));
      stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
      continue;
    }
    stack.Push(TransformTreeNode(node.x + half_width, node.y + half_height,
                                 split_tx_size));
    stack.Push(TransformTreeNode(node.x, node.y + half_height, split_tx_size));
    stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
    stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
  } while (!stack.Empty());
  return true;
}
1745 
// Applies the inverse transform to the parsed residual and adds it to the
// prediction in the frame buffer for one transform block.
void Tile::ReconstructBlock(const Block& block, Plane plane, int start_x,
                            int start_y, TransformSize tx_size,
                            TransformType tx_type, int non_zero_coeff_count) {
  // Reconstruction process. Steps 2 and 3 of Section 7.12.3 in the spec.
  assert(non_zero_coeff_count >= 0);
  // An all-zero block leaves the prediction untouched.
  if (non_zero_coeff_count == 0) return;
#if LIBGAV1_MAX_BITDEPTH >= 10
  if (sequence_header_.color_config.bitdepth > 8) {
    // Reinterpret the byte buffer as 16-bit pixels for high bitdepth.
    Array2DView<uint16_t> buffer(
        buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
        reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
    Reconstruct(dsp_, tx_type, tx_size,
                frame_header_.segmentation
                    .lossless[block.bp->prediction_parameters->segment_id],
                reinterpret_cast<int32_t*>(*block.residual), start_x, start_y,
                &buffer, non_zero_coeff_count);
  } else  // NOLINT
#endif
  {
    Reconstruct(dsp_, tx_type, tx_size,
                frame_header_.segmentation
                    .lossless[block.bp->prediction_parameters->segment_id],
                reinterpret_cast<int16_t*>(*block.residual), start_x, start_y,
                &buffer_[plane], non_zero_coeff_count);
  }
  if (split_parse_and_decode_) {
    // Advance the shared residual pointer past this block's coefficients.
    *block.residual +=
        kTransformWidth[tx_size] * kTransformHeight[tx_size] * residual_size_;
  }
}
1776 
// Processes the residual of every plane of |block| in |mode|, splitting
// blocks wider or taller than 64 pixels into 64x64 chunks. Returns false on
// parse failure.
bool Tile::Residual(const Block& block, ProcessingMode mode) {
  const int width_chunks = std::max(1, block.width >> 6);
  const int height_chunks = std::max(1, block.height >> 6);
  const BlockSize size_chunk4x4 =
      (width_chunks > 1 || height_chunks > 1) ? kBlock64x64 : block.size;
  const BlockParameters& bp = *block.bp;
  for (int chunk_y = 0; chunk_y < height_chunks; ++chunk_y) {
    for (int chunk_x = 0; chunk_x < width_chunks; ++chunk_x) {
      const int num_planes = block.HasChroma() ? PlaneCount() : 1;
      int plane = kPlaneY;
      do {
        const int subsampling_x = subsampling_x_[plane];
        const int subsampling_y = subsampling_y_[plane];
        // For Y Plane, when lossless is true |bp.transform_size| is always
        // kTransformSize4x4. So we can simply use |bp.transform_size| here as
        // the Y plane's transform size (part of Section 5.11.37 in the spec).
        const TransformSize tx_size =
            (plane == kPlaneY)
                ? inter_transform_sizes_[block.row4x4][block.column4x4]
                : bp.uv_transform_size;
        const BlockSize plane_size =
            kPlaneResidualSize[size_chunk4x4][subsampling_x][subsampling_y];
        assert(plane_size != kBlockInvalid);
        if (bp.is_inter &&
            !frame_header_.segmentation
                 .lossless[bp.prediction_parameters->segment_id] &&
            plane == kPlaneY) {
          // Non-lossless inter luma uses a recursive transform split,
          // handled by TransformTree().
          const int row_chunk4x4 = block.row4x4 + MultiplyBy16(chunk_y);
          const int column_chunk4x4 = block.column4x4 + MultiplyBy16(chunk_x);
          const int base_x = MultiplyBy4(column_chunk4x4 >> subsampling_x);
          const int base_y = MultiplyBy4(row_chunk4x4 >> subsampling_y);
          if (!TransformTree(block, base_x, base_y, plane_size, mode)) {
            return false;
          }
        } else {
          // Uniform transform size: iterate the plane in tx_size steps.
          const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
          const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
          const int step_x = kTransformWidth4x4[tx_size];
          const int step_y = kTransformHeight4x4[tx_size];
          const int num4x4_wide = kNum4x4BlocksWide[plane_size];
          const int num4x4_high = kNum4x4BlocksHigh[plane_size];
          for (int y = 0; y < num4x4_high; y += step_y) {
            for (int x = 0; x < num4x4_wide; x += step_x) {
              if (!TransformBlock(
                      block, static_cast<Plane>(plane), base_x, base_y, tx_size,
                      x + (MultiplyBy16(chunk_x) >> subsampling_x),
                      y + (MultiplyBy16(chunk_y) >> subsampling_y), mode)) {
                return false;
              }
            }
          }
        }
      } while (++plane < num_planes);
    }
  }
  return true;
}
1834 
1835 // The purpose of this function is to limit the maximum size of motion vectors
1836 // and also, if use_intra_block_copy is true, to additionally constrain the
1837 // motion vector so that the data is fetched from parts of the tile that have
1838 // already been decoded and are not too close to the current block (in order to
1839 // make a pipelined decoder implementation feasible).
bool Tile::IsMvValid(const Block& block, bool is_compound) const {
  const BlockParameters& bp = *block.bp;
  // Each motion vector component (in 1/8-pel units) must fit in 14 bits.
  for (int i = 0; i < 1 + static_cast<int>(is_compound); ++i) {
    for (int mv_component : bp.mv.mv[i].mv) {
      if (std::abs(mv_component) >= (1 << 14)) {
        return false;
      }
    }
  }
  if (!block.bp->prediction_parameters->use_intra_block_copy) {
    return true;
  }
  // Intra block copy vectors must be whole-pel: the low 3 (fractional) bits
  // of both packed 16-bit components must be zero.
  if ((bp.mv.mv[0].mv32 & 0x00070007) != 0) {
    return false;
  }
  // Convert from 1/8-pel to whole pixels. mv[0] is the row offset, mv[1] the
  // column offset.
  const int delta_row = bp.mv.mv[0].mv[0] >> 3;
  const int delta_column = bp.mv.mv[0].mv[1] >> 3;
  int src_top_edge = MultiplyBy4(block.row4x4) + delta_row;
  int src_left_edge = MultiplyBy4(block.column4x4) + delta_column;
  const int src_bottom_edge = src_top_edge + block.height;
  const int src_right_edge = src_left_edge + block.width;
  if (block.HasChroma()) {
    // Sub-8x8 blocks with subsampled chroma read from 4 luma pixels further
    // out, so widen the source rectangle on those edges.
    if (block.width < 8 && subsampling_x_[kPlaneU] != 0) {
      src_left_edge -= 4;
    }
    if (block.height < 8 && subsampling_y_[kPlaneU] != 0) {
      src_top_edge -= 4;
    }
  }
  // The whole source rectangle must lie inside the current tile.
  if (src_top_edge < MultiplyBy4(row4x4_start_) ||
      src_left_edge < MultiplyBy4(column4x4_start_) ||
      src_bottom_edge > MultiplyBy4(row4x4_end_) ||
      src_right_edge > MultiplyBy4(column4x4_end_)) {
    return false;
  }
  // sb_height_log2 = use_128x128_superblock ? log2(128) : log2(64)
  const int sb_height_log2 =
      6 + static_cast<int>(sequence_header_.use_128x128_superblock);
  const int active_sb_row = MultiplyBy4(block.row4x4) >> sb_height_log2;
  const int active_64x64_block_column = MultiplyBy4(block.column4x4) >> 6;
  const int src_sb_row = (src_bottom_edge - 1) >> sb_height_log2;
  const int src_64x64_block_column = (src_right_edge - 1) >> 6;
  const int total_64x64_blocks_per_row =
      ((column4x4_end_ - column4x4_start_ - 1) >> 4) + 1;
  const int active_64x64_block =
      active_sb_row * total_64x64_blocks_per_row + active_64x64_block_column;
  const int src_64x64_block =
      src_sb_row * total_64x64_blocks_per_row + src_64x64_block_column;
  // The source must be at least kIntraBlockCopyDelay64x64Blocks (in raster
  // order of 64x64 blocks) behind the block being decoded.
  if (src_64x64_block >= active_64x64_block - kIntraBlockCopyDelay64x64Blocks) {
    return false;
  }

  // Wavefront constraint: use only top left area of frame for reference.
  if (src_sb_row > active_sb_row) return false;
  const int gradient =
      1 + kIntraBlockCopyDelay64x64Blocks +
      static_cast<int>(sequence_header_.use_128x128_superblock);
  const int wavefront_offset = gradient * (active_sb_row - src_sb_row);
  return src_64x64_block_column < active_64x64_block_column -
                                      kIntraBlockCopyDelay64x64Blocks +
                                      wavefront_offset;
}
1902 
// Computes the final motion vector(s) for an inter block. For each reference
// (one, or two when |is_compound| is true) a predicted motion vector is
// selected according to the prediction mode: the global motion vector for
// kPredictionModeGlobalMv, or an entry of the reference MV stack otherwise.
// For kPredictionModeNewMv a decoded delta is added on top of the prediction.
// Returns IsMvValid() on the resulting vector(s).
bool Tile::AssignInterMv(const Block& block, bool is_compound) {
  // Per-component clamping bounds: index 0 is the row component, index 1 the
  // column component.
  int min[2];
  int max[2];
  GetClampParameters(block, min, max);
  BlockParameters& bp = *block.bp;
  const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
  bp.mv.mv64 = 0;
  if (is_compound) {
    for (int i = 0; i < 2; ++i) {
      const PredictionMode mode = GetSinglePredictionMode(i, bp.y_mode);
      MotionVector predicted_mv;
      if (mode == kPredictionModeGlobalMv) {
        predicted_mv = prediction_parameters.global_mv[i];
      } else {
        // Nearest mode (and New mode with at most one stack candidate) uses
        // the first stack entry; otherwise use the coded ref_mv_index.
        const int ref_mv_index = (mode == kPredictionModeNearestMv ||
                                  (mode == kPredictionModeNewMv &&
                                   prediction_parameters.ref_mv_count <= 1))
                                     ? 0
                                     : prediction_parameters.ref_mv_index;
        predicted_mv = prediction_parameters.reference_mv(ref_mv_index, i);
        // Only candidates actually present in the stack are clamped.
        if (ref_mv_index < prediction_parameters.ref_mv_count) {
          predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
          predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
        }
      }
      if (mode == kPredictionModeNewMv) {
        // Reads the motion vector delta into bp.mv.mv[i] before adding the
        // prediction.
        ReadMotionVector(block, i);
        bp.mv.mv[i].mv[0] += predicted_mv.mv[0];
        bp.mv.mv[i].mv[1] += predicted_mv.mv[1];
      } else {
        bp.mv.mv[i] = predicted_mv;
      }
    }
  } else {
    // Single-reference path; mirrors the compound path for reference 0.
    const PredictionMode mode = GetSinglePredictionMode(0, bp.y_mode);
    MotionVector predicted_mv;
    if (mode == kPredictionModeGlobalMv) {
      predicted_mv = prediction_parameters.global_mv[0];
    } else {
      const int ref_mv_index = (mode == kPredictionModeNearestMv ||
                                (mode == kPredictionModeNewMv &&
                                 prediction_parameters.ref_mv_count <= 1))
                                   ? 0
                                   : prediction_parameters.ref_mv_index;
      predicted_mv = prediction_parameters.reference_mv(ref_mv_index);
      if (ref_mv_index < prediction_parameters.ref_mv_count) {
        predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
        predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
      }
    }
    if (mode == kPredictionModeNewMv) {
      ReadMotionVector(block, 0);
      bp.mv.mv[0].mv[0] += predicted_mv.mv[0];
      bp.mv.mv[0].mv[1] += predicted_mv.mv[1];
    } else {
      bp.mv.mv[0] = predicted_mv;
    }
  }
  return IsMvValid(block, is_compound);
}
1963 
AssignIntraMv(const Block & block)1964 bool Tile::AssignIntraMv(const Block& block) {
1965   // TODO(linfengz): Check if the clamping process is necessary.
1966   int min[2];
1967   int max[2];
1968   GetClampParameters(block, min, max);
1969   BlockParameters& bp = *block.bp;
1970   const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
1971   const MotionVector& ref_mv_0 = prediction_parameters.reference_mv(0);
1972   bp.mv.mv64 = 0;
1973   ReadMotionVector(block, 0);
1974   if (ref_mv_0.mv32 == 0) {
1975     const MotionVector& ref_mv_1 = prediction_parameters.reference_mv(1);
1976     if (ref_mv_1.mv32 == 0) {
1977       const int super_block_size4x4 = kNum4x4BlocksHigh[SuperBlockSize()];
1978       if (block.row4x4 - super_block_size4x4 < row4x4_start_) {
1979         bp.mv.mv[0].mv[1] -= MultiplyBy32(super_block_size4x4);
1980         bp.mv.mv[0].mv[1] -= MultiplyBy8(kIntraBlockCopyDelayPixels);
1981       } else {
1982         bp.mv.mv[0].mv[0] -= MultiplyBy32(super_block_size4x4);
1983       }
1984     } else {
1985       bp.mv.mv[0].mv[0] += Clip3(ref_mv_1.mv[0], min[0], max[0]);
1986       bp.mv.mv[0].mv[1] += Clip3(ref_mv_1.mv[1], min[0], max[0]);
1987     }
1988   } else {
1989     bp.mv.mv[0].mv[0] += Clip3(ref_mv_0.mv[0], min[0], max[0]);
1990     bp.mv.mv[0].mv[1] += Clip3(ref_mv_0.mv[1], min[1], max[1]);
1991   }
1992   return IsMvValid(block, /*is_compound=*/false);
1993 }
1994 
ResetEntropyContext(const Block & block)1995 void Tile::ResetEntropyContext(const Block& block) {
1996   const int num_planes = block.HasChroma() ? PlaneCount() : 1;
1997   int plane = kPlaneY;
1998   do {
1999     const int subsampling_x = subsampling_x_[plane];
2000     const int start_x = block.column4x4 >> subsampling_x;
2001     const int end_x =
2002         std::min((block.column4x4 + block.width4x4) >> subsampling_x,
2003                  frame_header_.columns4x4);
2004     memset(&coefficient_levels_[kEntropyContextTop][plane][start_x], 0,
2005            end_x - start_x);
2006     memset(&dc_categories_[kEntropyContextTop][plane][start_x], 0,
2007            end_x - start_x);
2008     const int subsampling_y = subsampling_y_[plane];
2009     const int start_y = block.row4x4 >> subsampling_y;
2010     const int end_y =
2011         std::min((block.row4x4 + block.height4x4) >> subsampling_y,
2012                  frame_header_.rows4x4);
2013     memset(&coefficient_levels_[kEntropyContextLeft][plane][start_y], 0,
2014            end_y - start_y);
2015     memset(&dc_categories_[kEntropyContextLeft][plane][start_y], 0,
2016            end_y - start_y);
2017   } while (++plane < num_planes);
2018 }
2019 
// Computes the prediction samples for an inter block (intra blocks are
// predicted elsewhere; this returns true immediately for them). For each
// plane it runs inter prediction over the block, subdividing into
// prediction-unit sized pieces, and additionally runs intra prediction first
// when the block uses inter-intra compound (reference_frame[1] ==
// kReferenceFrameIntra). Returns false on inter prediction failure.
bool Tile::ComputePrediction(const Block& block) {
  const BlockParameters& bp = *block.bp;
  if (!bp.is_inter) return true;
  // Mask to get the position of the block within its superblock (16 or 32
  // 4x4 units wide depending on use_128x128_superblock).
  const int mask =
      (1 << (4 + static_cast<int>(sequence_header_.use_128x128_superblock))) -
      1;
  const int sub_block_row4x4 = block.row4x4 & mask;
  const int sub_block_column4x4 = block.column4x4 & mask;
  const int plane_count = block.HasChroma() ? PlaneCount() : 1;
  // Returns true if this block applies local warping. The state is determined
  // in the Y plane and carried for use in the U/V planes.
  // But the U/V planes will not apply warping when the block size is smaller
  // than 8x8, even if this variable is true.
  bool is_local_valid = false;
  // Local warping parameters, similar usage as is_local_valid.
  GlobalMotion local_warp_params;
  int plane = kPlaneY;
  do {
    const int8_t subsampling_x = subsampling_x_[plane];
    const int8_t subsampling_y = subsampling_y_[plane];
    const BlockSize plane_size = block.residual_size[plane];
    const int block_width4x4 = kNum4x4BlocksWide[plane_size];
    const int block_height4x4 = kNum4x4BlocksHigh[plane_size];
    const int block_width = MultiplyBy4(block_width4x4);
    const int block_height = MultiplyBy4(block_height4x4);
    const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
    const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
    if (bp.reference_frame[1] == kReferenceFrameIntra) {
      // Inter-intra compound: run intra prediction first; the inter
      // prediction below is then blended with it.
      const int tr_row4x4 = sub_block_row4x4 >> subsampling_y;
      const int tr_column4x4 =
          (sub_block_column4x4 >> subsampling_x) + block_width4x4 + 1;
      const int bl_row4x4 =
          (sub_block_row4x4 >> subsampling_y) + block_height4x4;
      const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x) + 1;
      const TransformSize tx_size =
          k4x4SizeToTransformSize[k4x4WidthLog2[plane_size]]
                                 [k4x4HeightLog2[plane_size]];
      const bool has_left = block.left_available[plane];
      const bool has_top = block.top_available[plane];
      // Dispatches to the 8/10-bit IntraPrediction implementation.
      CALL_BITDEPTH_FUNCTION(
          IntraPrediction, block, static_cast<Plane>(plane), base_x, base_y,
          has_left, has_top,
          block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
          block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
          kInterIntraToIntraMode[block.bp->prediction_parameters
                                     ->inter_intra_mode],
          tx_size);
    }
    int candidate_row = block.row4x4;
    int candidate_column = block.column4x4;
    bool some_use_intra = bp.reference_frame[0] == kReferenceFrameIntra;
    if (!some_use_intra && plane != 0) {
      // For subsampled chroma of sub-8x8 blocks, the prediction parameters
      // come from the covering (aligned) luma block. Check whether any of the
      // luma blocks covered by this chroma block is intra coded.
      candidate_row = (candidate_row >> subsampling_y) << subsampling_y;
      candidate_column = (candidate_column >> subsampling_x) << subsampling_x;
      if (candidate_row != block.row4x4) {
        // Top block.
        const BlockParameters& bp_top =
            *block_parameters_holder_.Find(candidate_row, block.column4x4);
        some_use_intra = bp_top.reference_frame[0] == kReferenceFrameIntra;
        if (!some_use_intra && candidate_column != block.column4x4) {
          // Top-left block.
          const BlockParameters& bp_top_left =
              *block_parameters_holder_.Find(candidate_row, candidate_column);
          some_use_intra =
              bp_top_left.reference_frame[0] == kReferenceFrameIntra;
        }
      }
      if (!some_use_intra && candidate_column != block.column4x4) {
        // Left block.
        const BlockParameters& bp_left =
            *block_parameters_holder_.Find(block.row4x4, candidate_column);
        some_use_intra = bp_left.reference_frame[0] == kReferenceFrameIntra;
      }
    }
    int prediction_width;
    int prediction_height;
    if (some_use_intra) {
      // Predict the whole plane block in one piece from its own parameters.
      candidate_row = block.row4x4;
      candidate_column = block.column4x4;
      prediction_width = block_width;
      prediction_height = block_height;
    } else {
      prediction_width = block.width >> subsampling_x;
      prediction_height = block.height >> subsampling_y;
    }
    // Iterate over the prediction units covering the plane block.
    int r = 0;
    int y = 0;
    do {
      int c = 0;
      int x = 0;
      do {
        if (!InterPrediction(block, static_cast<Plane>(plane), base_x + x,
                             base_y + y, prediction_width, prediction_height,
                             candidate_row + r, candidate_column + c,
                             &is_local_valid, &local_warp_params)) {
          return false;
        }
        ++c;
        x += prediction_width;
      } while (x < block_width);
      ++r;
      y += prediction_height;
    } while (y < block_height);
  } while (++plane < plane_count);
  return true;
}
2126 
2127 #undef CALL_BITDEPTH_FUNCTION
2128 
PopulateDeblockFilterLevel(const Block & block)2129 void Tile::PopulateDeblockFilterLevel(const Block& block) {
2130   if (!post_filter_.DoDeblock()) return;
2131   BlockParameters& bp = *block.bp;
2132   const int mode_id =
2133       static_cast<int>(kPredictionModeDeltasMask.Contains(bp.y_mode));
2134   for (int i = 0; i < kFrameLfCount; ++i) {
2135     if (delta_lf_all_zero_) {
2136       bp.deblock_filter_level[i] = post_filter_.GetZeroDeltaDeblockFilterLevel(
2137           bp.prediction_parameters->segment_id, i, bp.reference_frame[0],
2138           mode_id);
2139     } else {
2140       bp.deblock_filter_level[i] =
2141           deblock_filter_levels_[bp.prediction_parameters->segment_id][i]
2142                                 [bp.reference_frame[0]][mode_id];
2143     }
2144   }
2145 }
2146 
// Marks the 4x4 units covered by |block| as needing CDEF in the packed
// cdef_skip_ bitmap. Nothing is recorded for skip blocks, or when CDEF is
// disabled for this 64x64 unit (cdef_index_ == -1).
void Tile::PopulateCdefSkip(const Block& block) {
  if (!post_filter_.DoCdef() || block.bp->skip ||
      (frame_header_.cdef.bits > 0 &&
       cdef_index_[DivideBy16(block.row4x4)][DivideBy16(block.column4x4)] ==
           -1)) {
    return;
  }
  // The rest of this function is an efficient version of the following code:
  // for (int y = block.row4x4; y < block.row4x4 + block.height4x4; y++) {
  //   for (int x = block.column4x4; x < block.column4x4 + block.width4x4;
  //        x++) {
  //     const uint8_t mask = uint8_t{1} << ((x >> 1) & 0x7);
  //     cdef_skip_[y >> 1][x >> 4] |= mask;
  //   }
  // }

  // For all block widths other than 32, the mask will fit in uint8_t. For
  // block width == 32, the mask is always 0xFFFF.
  // Number of 8x8 columns covered, accounting for an odd starting column.
  const int bw4 =
      std::max(DivideBy2(block.width4x4) + (block.column4x4 & 1), 1);
  const uint8_t mask = (block.width4x4 == 32)
                           ? 0xFF
                           : (uint8_t{0xFF} >> (8 - bw4))
                                 << (DivideBy2(block.column4x4) & 0x7);
  uint8_t* cdef_skip = &cdef_skip_[block.row4x4 >> 1][block.column4x4 >> 4];
  const int stride = cdef_skip_.columns();
  // One bitmap row covers two 4x4 rows, hence the step of 2.
  int row = 0;
  do {
    *cdef_skip |= mask;
    if (block.width4x4 == 32) {
      // The 32-wide mask spans two bitmap bytes; set the second byte here.
      *(cdef_skip + 1) = 0xFF;
    }
    cdef_skip += stride;
    row += 2;
  } while (row < block.height4x4);
}
2183 
// Parses one leaf block of the partition tree: mode info, palette tokens,
// transform sizes and residual. In non-split (parse-and-decode) operation it
// also computes the prediction and reconstructs the block; in split operation
// it only parses and records the block ordering for DecodeBlock(). Returns
// false on parse/decode failure.
bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
                        TileScratchBuffer* const scratch_buffer,
                        ResidualPtr* residual) {
  // Do not process the block if the starting point is beyond the visible frame.
  // This is equivalent to the has_row/has_column check in the
  // decode_partition() section of the spec when partition equals
  // kPartitionHorizontal or kPartitionVertical.
  if (row4x4 >= frame_header_.rows4x4 ||
      column4x4 >= frame_header_.columns4x4) {
    return true;
  }

  if (split_parse_and_decode_) {
    // Push block ordering info to the queue. DecodeBlock() will use this queue
    // to decode the blocks in the correct order.
    const int sb_row_index = SuperBlockRowIndex(row4x4);
    const int sb_column_index = SuperBlockColumnIndex(column4x4);
    if (!parse_only_) {
      residual_buffer_threaded_[sb_row_index][sb_column_index]
          ->partition_tree_order()
          ->Push(PartitionTreeNode(row4x4, column4x4, block_size));
    }
  }

  BlockParameters* bp_ptr =
      block_parameters_holder_.Get(row4x4, column4x4, block_size);
  if (bp_ptr == nullptr) {
    LIBGAV1_DLOG(ERROR, "Failed to get BlockParameters.");
    return false;
  }
  BlockParameters& bp = *bp_ptr;
  Block block(this, block_size, row4x4, column4x4, scratch_buffer, residual);
  bp.size = block_size;
  // In split operation every block needs its own PredictionParameters (the
  // decode pass runs later); otherwise the tile-level object is reused.
  bp.prediction_parameters =
      split_parse_and_decode_ ? std::unique_ptr<PredictionParameters>(
                                    new (std::nothrow) PredictionParameters())
                              : std::move(prediction_parameters_);
  if (bp.prediction_parameters == nullptr) return false;
  if (!DecodeModeInfo(block)) return false;
  if (parse_only_) {
    // Accumulate the block-weighted quantizer statistics for parse-only mode.
    const int block_weight = kBlockWeight[block_size];
    weighted_cumulative_block_qp_ += current_quantizer_index_ * block_weight;
    cumulative_block_weights_ += block_weight;
  }
  PopulateDeblockFilterLevel(block);
  if (!ReadPaletteTokens(block)) return false;
  DecodeTransformSize(block);
  // Part of Section 5.11.37 in the spec (implemented as a simple lookup).
  bp.uv_transform_size =
      frame_header_.segmentation.lossless[bp.prediction_parameters->segment_id]
          ? kTransformSize4x4
          : kUVTransformSize[block.residual_size[kPlaneU]];
  if (bp.skip) ResetEntropyContext(block);
  PopulateCdefSkip(block);
  if (split_parse_and_decode_) {
    if (!Residual(block, kProcessingModeParseOnly)) return false;
  } else {
    if (!ComputePrediction(block) ||
        !Residual(block, kProcessingModeParseAndDecode)) {
      return false;
    }
  }
  // If frame_header_.segmentation.enabled is false,
  // bp.prediction_parameters->segment_id is 0 for all blocks. We don't need to
  // call save bp.prediction_parameters->segment_id in the current frame because
  // the current frame's segmentation map will be cleared to all 0s.
  //
  // If frame_header_.segmentation.enabled is true and
  // frame_header_.segmentation.update_map is false, we will copy the previous
  // frame's segmentation map to the current frame. So we don't need to call
  // save bp.prediction_parameters->segment_id in the current frame.
  if (frame_header_.segmentation.enabled &&
      frame_header_.segmentation.update_map) {
    const int x_limit = std::min(frame_header_.columns4x4 - column4x4,
                                 static_cast<int>(block.width4x4));
    const int y_limit = std::min(frame_header_.rows4x4 - row4x4,
                                 static_cast<int>(block.height4x4));
    current_frame_.segmentation_map()->FillBlock(
        row4x4, column4x4, x_limit, y_limit,
        bp.prediction_parameters->segment_id);
  }
  StoreMotionFieldMvsIntoCurrentFrame(block);
  if (!split_parse_and_decode_) {
    // Return the tile-level PredictionParameters object for reuse by the next
    // block.
    prediction_parameters_ = std::move(bp.prediction_parameters);
  }
  return true;
}
2271 
DecodeBlock(int row4x4,int column4x4,BlockSize block_size,TileScratchBuffer * const scratch_buffer,ResidualPtr * residual)2272 bool Tile::DecodeBlock(int row4x4, int column4x4, BlockSize block_size,
2273                        TileScratchBuffer* const scratch_buffer,
2274                        ResidualPtr* residual) {
2275   if (row4x4 >= frame_header_.rows4x4 ||
2276       column4x4 >= frame_header_.columns4x4) {
2277     return true;
2278   }
2279   Block block(this, block_size, row4x4, column4x4, scratch_buffer, residual);
2280   if (!ComputePrediction(block) ||
2281       !Residual(block, kProcessingModeDecodeOnly)) {
2282     return false;
2283   }
2284   block.bp->prediction_parameters.reset(nullptr);
2285   return true;
2286 }
2287 
// Parses the partition tree rooted at the superblock at (row4x4_start,
// column4x4_start) with an explicit stack (iterative DFS), calling
// ProcessBlock() for every leaf. Returns false on any parse failure or on a
// bitstream-conformance violation in the partition sizes.
bool Tile::ProcessPartition(int row4x4_start, int column4x4_start,
                            TileScratchBuffer* const scratch_buffer,
                            ResidualPtr* residual) {
  Stack<PartitionTreeNode, kDfsStackSize> stack;

  // Set up the first iteration.
  stack.Push(
      PartitionTreeNode(row4x4_start, column4x4_start, SuperBlockSize()));

  // DFS loop. If it sees a terminal node (leaf node), ProcessBlock is invoked.
  // Otherwise, the children are pushed into the stack for future processing.
  do {
    PartitionTreeNode node = stack.Pop();
    int row4x4 = node.row4x4;
    int column4x4 = node.column4x4;
    BlockSize block_size = node.block_size;

    if (row4x4 >= frame_header_.rows4x4 ||
        column4x4 >= frame_header_.columns4x4) {
      continue;
    }
    const int block_width4x4 = kNum4x4BlocksWide[block_size];
    assert(block_width4x4 == kNum4x4BlocksHigh[block_size]);
    const int half_block4x4 = block_width4x4 >> 1;
    // Whether the bottom/right halves of this node are inside the frame;
    // determines which partition symbols are possible.
    const bool has_rows = (row4x4 + half_block4x4) < frame_header_.rows4x4;
    const bool has_columns =
        (column4x4 + half_block4x4) < frame_header_.columns4x4;
    Partition partition;
    if (!ReadPartition(row4x4, column4x4, block_size, has_rows, has_columns,
                       &partition)) {
      LIBGAV1_DLOG(ERROR, "Failed to read partition for row: %d column: %d",
                   row4x4, column4x4);
      return false;
    }
    const BlockSize sub_size = kSubSize[partition][block_size];
    // Section 6.10.4: It is a requirement of bitstream conformance that
    // get_plane_residual_size( subSize, 1 ) is not equal to BLOCK_INVALID
    // every time subSize is computed.
    if (sub_size == kBlockInvalid ||
        kPlaneResidualSize[sub_size]
                          [sequence_header_.color_config.subsampling_x]
                          [sequence_header_.color_config.subsampling_y] ==
            kBlockInvalid) {
      LIBGAV1_DLOG(
          ERROR,
          "Invalid sub-block/plane size for row: %d column: %d partition: "
          "%d block_size: %d sub_size: %d subsampling_x/y: %d, %d",
          row4x4, column4x4, partition, block_size, sub_size,
          sequence_header_.color_config.subsampling_x,
          sequence_header_.color_config.subsampling_y);
      return false;
    }

    const int quarter_block4x4 = half_block4x4 >> 1;
    const BlockSize split_size = kSubSize[kPartitionSplit][block_size];
    assert(partition == kPartitionNone || sub_size != kBlockInvalid);
    switch (partition) {
      case kPartitionNone:
        // Leaf node: one block of the full size.
        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
                          residual)) {
          return false;
        }
        break;
      case kPartitionSplit:
        // The children must be added in reverse order since a stack is being
        // used.
        stack.Push(PartitionTreeNode(row4x4 + half_block4x4,
                                     column4x4 + half_block4x4, sub_size));
        stack.Push(
            PartitionTreeNode(row4x4 + half_block4x4, column4x4, sub_size));
        stack.Push(
            PartitionTreeNode(row4x4, column4x4 + half_block4x4, sub_size));
        stack.Push(PartitionTreeNode(row4x4, column4x4, sub_size));
        break;
      case kPartitionHorizontal:
        // Two half-height blocks, top then bottom.
        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size,
                          scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionVertical:
        // Two half-width blocks, left then right.
        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size,
                          scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionHorizontalWithTopSplit:
        // Two quarter blocks on top, one half-height block below.
        if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size,
                          scratch_buffer, residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size,
                          scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionHorizontalWithBottomSplit:
        // One half-height block on top, two quarter blocks below.
        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size,
                          scratch_buffer, residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4,
                          split_size, scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionVerticalWithLeftSplit:
        // Two quarter blocks on the left, one half-width block on the right.
        if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size,
                          scratch_buffer, residual) ||
            !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size,
                          scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionVerticalWithRightSplit:
        // One half-width block on the left, two quarter blocks on the right.
        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size,
                          scratch_buffer, residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4,
                          split_size, scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionHorizontal4:
        // Four quarter-height blocks, top to bottom.
        for (int i = 0; i < 4; ++i) {
          if (!ProcessBlock(row4x4 + i * quarter_block4x4, column4x4, sub_size,
                            scratch_buffer, residual)) {
            return false;
          }
        }
        break;
      case kPartitionVertical4:
        // Four quarter-width blocks, left to right.
        for (int i = 0; i < 4; ++i) {
          if (!ProcessBlock(row4x4, column4x4 + i * quarter_block4x4, sub_size,
                            scratch_buffer, residual)) {
            return false;
          }
        }
        break;
    }
  } while (!stack.Empty());
  return true;
}
2438 
ResetLoopRestorationParams()2439 void Tile::ResetLoopRestorationParams() {
2440   for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
2441     for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
2442       reference_unit_info_[plane].sgr_proj_info.multiplier[i] =
2443           kSgrProjDefaultMultiplier[i];
2444       for (int j = 0; j < kNumWienerCoefficients; ++j) {
2445         reference_unit_info_[plane].wiener_info.filter[i][j] =
2446             kWienerDefaultFilter[j];
2447       }
2448     }
2449   }
2450 }
2451 
ResetCdef(const int row4x4,const int column4x4)2452 void Tile::ResetCdef(const int row4x4, const int column4x4) {
2453   if (frame_header_.cdef.bits == 0) return;
2454   const int row = DivideBy16(row4x4);
2455   const int column = DivideBy16(column4x4);
2456   cdef_index_[row][column] = -1;
2457   if (sequence_header_.use_128x128_superblock) {
2458     const int cdef_size4x4 = kNum4x4BlocksWide[kBlock64x64];
2459     const int border_row = DivideBy16(row4x4 + cdef_size4x4);
2460     const int border_column = DivideBy16(column4x4 + cdef_size4x4);
2461     cdef_index_[row][border_column] = -1;
2462     cdef_index_[border_row][column] = -1;
2463     cdef_index_[border_row][border_column] = -1;
2464   }
2465 }
2466 
// Initializes scratch_buffer->block_decoded for the superblock at
// (row4x4, column4x4): clears the whole array, then sets the top row and left
// column (the -1 borders, stored at index 0) to true where neighbors exist,
// mirroring the clear_block_decoded_flags() process of the spec.
void Tile::ClearBlockDecoded(TileScratchBuffer* const scratch_buffer,
                             int row4x4, int column4x4) {
  // Set everything to false.
  memset(scratch_buffer->block_decoded, 0,
         sizeof(scratch_buffer->block_decoded));
  // Set specific edge cases to true.
  const int sb_size4 = sequence_header_.use_128x128_superblock ? 32 : 16;
  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
    const int subsampling_x = subsampling_x_[plane];
    const int subsampling_y = subsampling_y_[plane];
    // Remaining width/height of the tile from this superblock, in this
    // plane's 4x4 units.
    const int sb_width4 = (column4x4_end_ - column4x4) >> subsampling_x;
    const int sb_height4 = (row4x4_end_ - row4x4) >> subsampling_y;
    // The memset is equivalent to the following lines in the spec:
    // for ( x = -1; x <= ( sbSize4 >> subX ); x++ ) {
    //   if ( y < 0 && x < sbWidth4 ) {
    //     BlockDecoded[plane][y][x] = 1
    //   }
    // }
    const int num_elements =
        std::min((sb_size4 >> subsampling_x_[plane]) + 1, sb_width4) + 1;
    memset(&scratch_buffer->block_decoded[plane][0][0], 1, num_elements);
    // The for loop is equivalent to the following lines in the spec:
    // for ( y = -1; y <= ( sbSize4 >> subY ); y++ )
    //   if ( x < 0 && y < sbHeight4 )
    //     BlockDecoded[plane][y][x] = 1
    //   }
    // }
    // BlockDecoded[plane][sbSize4 >> subY][-1] = 0
    for (int y = -1; y < std::min((sb_size4 >> subsampling_y), sb_height4);
         ++y) {
      scratch_buffer->block_decoded[plane][y + 1][0] = true;
    }
  }
}
2501 
// Parses and/or decodes one superblock at (row4x4, column4x4) according to
// |mode|. kProcessingModeParseAndDecode uses the tile-level residual buffer
// in a single pass; kProcessingModeParseOnly parses into a per-superblock
// buffer from residual_buffer_pool_; kProcessingModeDecodeOnly consumes and
// releases that buffer. Returns false on failure.
bool Tile::ProcessSuperBlock(int row4x4, int column4x4,
                             TileScratchBuffer* const scratch_buffer,
                             ProcessingMode mode) {
  const bool parsing =
      mode == kProcessingModeParseOnly || mode == kProcessingModeParseAndDecode;
  const bool decoding = mode == kProcessingModeDecodeOnly ||
                        mode == kProcessingModeParseAndDecode;
  if (parsing) {
    read_deltas_ = frame_header_.delta_q.present;
    ResetCdef(row4x4, column4x4);
  }
  if (decoding) {
    ClearBlockDecoded(scratch_buffer, row4x4, column4x4);
  }
  const BlockSize block_size = SuperBlockSize();
  if (parsing) {
    ReadLoopRestorationCoefficients(row4x4, column4x4, block_size);
  }
  if (parsing && decoding) {
    // Single-pass operation: parse and decode the partition tree using the
    // tile's own residual buffer.
    uint8_t* residual_buffer = residual_buffer_.get();
    if (!ProcessPartition(row4x4, column4x4, scratch_buffer,
                          &residual_buffer)) {
      LIBGAV1_DLOG(ERROR, "Error decoding partition row: %d column: %d", row4x4,
                   column4x4);
      return false;
    }
    return true;
  }
  const int sb_row_index = SuperBlockRowIndex(row4x4);
  const int sb_column_index = SuperBlockColumnIndex(column4x4);
  if (parsing) {
    // Parse into a pooled per-superblock residual buffer that the decode
    // pass will consume later.
    residual_buffer_threaded_[sb_row_index][sb_column_index] =
        residual_buffer_pool_->Get();
    if (residual_buffer_threaded_[sb_row_index][sb_column_index] == nullptr) {
      LIBGAV1_DLOG(ERROR, "Failed to get residual buffer.");
      return false;
    }
    uint8_t* residual_buffer =
        residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
    if (!ProcessPartition(row4x4, column4x4, scratch_buffer,
                          &residual_buffer)) {
      LIBGAV1_DLOG(ERROR, "Error parsing partition row: %d column: %d", row4x4,
                   column4x4);
      return false;
    }
    if (parse_only_) {
      // No decode pass will run; return the buffer to the pool immediately.
      residual_buffer_pool_->Release(
          std::move(residual_buffer_threaded_[sb_row_index][sb_column_index]));
    }
  } else {
    if (!DecodeSuperBlock(sb_row_index, sb_column_index, scratch_buffer)) {
      LIBGAV1_DLOG(ERROR, "Error decoding superblock row: %d column: %d",
                   row4x4, column4x4);
      return false;
    }
    residual_buffer_pool_->Release(
        std::move(residual_buffer_threaded_[sb_row_index][sb_column_index]));
  }
  return true;
}
2562 
DecodeSuperBlock(int sb_row_index,int sb_column_index,TileScratchBuffer * const scratch_buffer)2563 bool Tile::DecodeSuperBlock(int sb_row_index, int sb_column_index,
2564                             TileScratchBuffer* const scratch_buffer) {
2565   uint8_t* residual_buffer =
2566       residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
2567   Queue<PartitionTreeNode>& partition_tree_order =
2568       *residual_buffer_threaded_[sb_row_index][sb_column_index]
2569            ->partition_tree_order();
2570   while (!partition_tree_order.Empty()) {
2571     PartitionTreeNode block = partition_tree_order.Front();
2572     if (!DecodeBlock(block.row4x4, block.column4x4, block.block_size,
2573                      scratch_buffer, &residual_buffer)) {
2574       LIBGAV1_DLOG(ERROR, "Error decoding block row: %d column: %d",
2575                    block.row4x4, block.column4x4);
2576       return false;
2577     }
2578     partition_tree_order.Pop();
2579   }
2580   return true;
2581 }
2582 
ReadLoopRestorationCoefficients(int row4x4,int column4x4,BlockSize block_size)2583 void Tile::ReadLoopRestorationCoefficients(int row4x4, int column4x4,
2584                                            BlockSize block_size) {
2585   if (frame_header_.allow_intrabc) return;
2586   LoopRestorationInfo* const restoration_info = post_filter_.restoration_info();
2587   const bool is_superres_scaled =
2588       frame_header_.width != frame_header_.upscaled_width;
2589   for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
2590     LoopRestorationUnitInfo unit_info;
2591     if (restoration_info->PopulateUnitInfoForSuperBlock(
2592             static_cast<Plane>(plane), block_size, is_superres_scaled,
2593             frame_header_.superres_scale_denominator, row4x4, column4x4,
2594             &unit_info)) {
2595       for (int unit_row = unit_info.row_start; unit_row < unit_info.row_end;
2596            ++unit_row) {
2597         for (int unit_column = unit_info.column_start;
2598              unit_column < unit_info.column_end; ++unit_column) {
2599           const int unit_id = unit_row * restoration_info->num_horizontal_units(
2600                                              static_cast<Plane>(plane)) +
2601                               unit_column;
2602           restoration_info->ReadUnitCoefficients(
2603               &reader_, &symbol_decoder_context_, static_cast<Plane>(plane),
2604               unit_id, &reference_unit_info_);
2605         }
2606       }
2607     }
2608   }
2609 }
2610 
// Stores this block's motion vector and reference frame type into the current
// frame's motion field (at 8x8 granularity), for use as temporal MV
// candidates by later frames. At most one reference (the second one, scanned
// first via i = 1) is stored; the function returns after the first store.
void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) {
  // Nothing to store if this frame is never saved as a reference
  // (refresh_frame_flags == 0) or carries no inter MVs (intra frame).
  if (frame_header_.refresh_frame_flags == 0 ||
      IsIntraFrame(frame_header_.frame_type)) {
    return;
  }
  // Iterate over odd rows/columns beginning at the first odd row/column for the
  // block. It is done this way because motion field mvs are only needed at a
  // 8x8 granularity.
  const int row_start4x4 = block.row4x4 | 1;
  const int row_limit4x4 =
      std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4);
  if (row_start4x4 >= row_limit4x4) return;
  const int column_start4x4 = block.column4x4 | 1;
  const int column_limit4x4 =
      std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4);
  if (column_start4x4 >= column_limit4x4) return;

  // The largest reference MV component that can be saved.
  constexpr int kRefMvsLimit = (1 << 12) - 1;
  const BlockParameters& bp = *block.bp;
  ReferenceInfo* reference_info = current_frame_.reference_info();
  // Scan the second reference first (i = 1) so that it takes priority when
  // both references qualify; the loop exits via |return| after a store.
  for (int i = 1; i >= 0; --i) {
    const ReferenceFrameType reference_frame_to_store = bp.reference_frame[i];
    // Skip intra/none references; only real inter references are stored.
    if (reference_frame_to_store <= kReferenceFrameIntra) continue;
    // Must make a local copy so that StoreMotionFieldMvs() knows there is no
    // overlap between load and store.
    const MotionVector mv_to_store = bp.mv.mv[i];
    const int mv_row = std::abs(mv_to_store.mv[0]);
    const int mv_column = std::abs(mv_to_store.mv[1]);
    // kRefMvsLimit equals 0x0FFF, so we can first bitwise OR the two absolute
    // values and then compare with kRefMvsLimit to save a branch.
    // The next line is equivalent to:
    // mv_row <= kRefMvsLimit && mv_column <= kRefMvsLimit
    // Only store backward references (relative_distance_from < 0); forward
    // ones are not useful for motion field projection here.
    if ((mv_row | mv_column) <= kRefMvsLimit &&
        reference_info->relative_distance_from[reference_frame_to_store] < 0) {
      // Convert the 4x4 block bounds to the 8x8 motion field grid.
      const int row_start8x8 = DivideBy2(row_start4x4);
      const int row_limit8x8 = DivideBy2(row_limit4x4);
      const int column_start8x8 = DivideBy2(column_start4x4);
      const int column_limit8x8 = DivideBy2(column_limit4x4);
      const int rows = row_limit8x8 - row_start8x8;
      const int columns = column_limit8x8 - column_start8x8;
      const ptrdiff_t stride = DivideBy2(current_frame_.columns4x4());
      ReferenceFrameType* const reference_frame_row_start =
          &reference_info
               ->motion_field_reference_frame[row_start8x8][column_start8x8];
      MotionVector* const mv =
          &reference_info->motion_field_mv[row_start8x8][column_start8x8];

      // Specialize columns cases 1, 2, 4, 8 and 16. This makes memset() inlined
      // and simplifies std::fill() for these cases.
      if (columns <= 1) {
        // Don't change the above condition to (columns == 1).
        // Condition (columns <= 1) may help the compiler simplify the inlining
        // of the general case of StoreMotionFieldMvs() by eliminating the
        // (columns == 0) case.
        assert(columns == 1);
        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
                            1, reference_frame_row_start, mv);
      } else if (columns == 2) {
        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
                            2, reference_frame_row_start, mv);
      } else if (columns == 4) {
        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
                            4, reference_frame_row_start, mv);
      } else if (columns == 8) {
        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
                            8, reference_frame_row_start, mv);
      } else if (columns == 16) {
        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
                            16, reference_frame_row_start, mv);
      } else if (columns < 16) {
        // This always true condition (columns < 16) may help the compiler
        // simplify the inlining of the following function.
        // This general case is rare and usually only happens to the blocks
        // which contain the right boundary of the frame.
        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
                            columns, reference_frame_row_start, mv);
      } else {
        assert(false);
      }
      return;
    }
  }
}
2695 
2696 }  // namespace libgav1
2697