1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include <algorithm>
16 #include <atomic>
17 #include <cassert>
18 
19 #include "src/post_filter.h"
20 #include "src/utils/blocking_counter.h"
21 #include "src/utils/compiler_attributes.h"
22 #include "src/utils/constants.h"
23 
24 namespace libgav1 {
25 namespace {
26 
// Number of 4x4 blocks along one side of a 64x64 unit.
constexpr int kStep64x64 = 16;  // =64/4.
// Value stored in place of a direction for an 8x8 luma block that is not
// filtered. It is tested as a bit flag (bit 3), so it cannot collide with the
// directions 0..7 that index kCdefUvDirection.
constexpr int kCdefSkip = 8;

// Direction to use for the chroma planes given the direction found for luma.
// Indexed as [subsampling_x][subsampling_y][luma direction].
constexpr uint8_t kCdefUvDirection[2][2][8] = {
    {{0, 1, 2, 3, 4, 5, 6, 7}, {1, 2, 2, 2, 3, 4, 6, 0}},
    {{7, 0, 2, 4, 5, 6, 6, 6}, {0, 1, 2, 3, 4, 5, 6, 7}}};

// Rows, relative to the first row of a unit, that SetupCdefBorder() backs up.
// Indexed as [subsampling_y][i]. These are the first two and the last two
// rows of a unit that is 64 rows tall (32 rows with vertical subsampling).
constexpr int kCdefBorderRows[2][4] = {{0, 1, 62, 63}, {0, 1, 30, 31}};
35 
36 template <typename Pixel>
CopyRowForCdef(const Pixel * src,int block_width,int unit_width,bool is_frame_left,bool is_frame_right,uint16_t * const dst,const Pixel * left_border=nullptr)37 void CopyRowForCdef(const Pixel* src, int block_width, int unit_width,
38                     bool is_frame_left, bool is_frame_right,
39                     uint16_t* const dst, const Pixel* left_border = nullptr) {
40   if (sizeof(src[0]) == sizeof(dst[0])) {
41     if (is_frame_left) {
42       Memset(dst - kCdefBorder, kCdefLargeValue, kCdefBorder);
43     } else if (left_border == nullptr) {
44       memcpy(dst - kCdefBorder, src - kCdefBorder,
45              kCdefBorder * sizeof(dst[0]));
46     } else {
47       memcpy(dst - kCdefBorder, left_border, kCdefBorder * sizeof(dst[0]));
48     }
49     memcpy(dst, src, block_width * sizeof(dst[0]));
50     if (is_frame_right) {
51       Memset(dst + block_width, kCdefLargeValue,
52              unit_width + kCdefBorder - block_width);
53     } else {
54       memcpy(dst + block_width, src + block_width,
55              (unit_width + kCdefBorder - block_width) * sizeof(dst[0]));
56     }
57     return;
58   }
59   if (is_frame_left) {
60     for (int x = -kCdefBorder; x < 0; ++x) {
61       dst[x] = static_cast<uint16_t>(kCdefLargeValue);
62     }
63   } else if (left_border == nullptr) {
64     for (int x = -kCdefBorder; x < 0; ++x) {
65       dst[x] = src[x];
66     }
67   } else {
68     for (int x = -kCdefBorder; x < 0; ++x) {
69       dst[x] = left_border[x + kCdefBorder];
70     }
71   }
72   for (int x = 0; x < block_width; ++x) {
73     dst[x] = src[x];
74   }
75   for (int x = block_width; x < unit_width + kCdefBorder; ++x) {
76     dst[x] = is_frame_right ? static_cast<uint16_t>(kCdefLargeValue) : src[x];
77   }
78 }
79 
80 // GCC 13.x will report a false positive from the call to
81 // ApplyCdefForOneSuperBlockRowHelper() with a nullptr in
82 // ApplyCdefForOneSuperBlockRow(). The call to CopyPixels() in
83 // ApplyCdefForOneUnit() is only made when thread_pool_ != nullptr and
84 // border_columns[][] is a valid pointer.
85 #if defined(__GNUC__) && !defined(__clang__)
86 #pragma GCC diagnostic push
87 #pragma GCC diagnostic ignored "-Warray-bounds"
88 #pragma GCC diagnostic ignored "-Wstringop-overflow"
89 #endif
// Copies a rectangle of |width| x |height| pixels, each |pixel_size| bytes
// wide, from |src| to |dst|. |src_stride| and |dst_stride| are the byte
// offsets between the starts of consecutive rows. |src| and |dst| must be
// non-null and |height| must be positive.
void CopyPixels(const uint8_t* src, int src_stride, uint8_t* dst,
                int dst_stride, int width, int height, size_t pixel_size) {
  assert(src != nullptr);
  assert(dst != nullptr);
  assert(height > 0);
  const size_t row_bytes = width * pixel_size;
  int rows_left = height;
  do {
    memcpy(dst, src, row_bytes);
    dst += dst_stride;
    src += src_stride;
    --rows_left;
  } while (rows_left != 0);
}
104 #if defined(__GNUC__) && !defined(__clang__)
105 #pragma GCC diagnostic pop
106 #endif
107 
108 }  // namespace
109 
SetupCdefBorder(int row4x4)110 void PostFilter::SetupCdefBorder(int row4x4) {
111   assert(row4x4 >= 0);
112   assert(DoCdef());
113   int plane = kPlaneY;
114   do {
115     const ptrdiff_t src_stride = frame_buffer_.stride(plane);
116     const ptrdiff_t dst_stride = cdef_border_.stride(plane);
117     const int row_offset = DivideBy4(row4x4);
118     const int num_pixels = SubsampledValue(
119         MultiplyBy4(frame_header_.columns4x4), subsampling_x_[plane]);
120     const int row_width = num_pixels << pixel_size_log2_;
121     const int plane_height = SubsampledValue(MultiplyBy4(frame_header_.rows4x4),
122                                              subsampling_y_[plane]);
123     for (int i = 0; i < 4; ++i) {
124       const int row = kCdefBorderRows[subsampling_y_[plane]][i];
125       const int absolute_row =
126           (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
127       if (absolute_row >= plane_height) break;
128       const uint8_t* src =
129           GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
130           row * src_stride;
131       uint8_t* dst = cdef_border_.data(plane) + dst_stride * (row_offset + i);
132       memcpy(dst, src, row_width);
133     }
134   } while (++plane < planes_);
135 }
136 
// Fills |cdef_source| with the 16-bit input read by the CDEF filters for one
// unit: the unit's pixels surrounded by a border of kCdefBorder entries on
// every side. Rows of |cdef_source| are |cdef_stride| entries apart.
//
// If |y_plane| is true only the luma plane is prepared. Otherwise both chroma
// planes are prepared, with the V plane placed kCdefUnitSizeWithBorders *
// kCdefUnitSizeWithBorders entries after the U plane.
//
// The unit covers |block_width4x4| x |block_height4x4| 4x4 blocks starting at
// (|row4x4|, |column4x4|). Border entries that lie outside the frame are set
// to kCdefLargeValue.
//
// |border_columns| holds backed-up left border pixels for each plane. It is
// only read when |use_border_columns| is true and |thread_pool_| is not
// nullptr.
template <typename Pixel>
void PostFilter::PrepareCdefBlock(int block_width4x4, int block_height4x4,
                                  int row4x4, int column4x4,
                                  uint16_t* cdef_source, ptrdiff_t cdef_stride,
                                  const bool y_plane,
                                  const uint8_t border_columns[kMaxPlanes][256],
                                  bool use_border_columns) {
  assert(y_plane || planes_ == kMaxPlanes);
  const int max_planes = y_plane ? 1 : kMaxPlanes;
  const int8_t subsampling_x = y_plane ? 0 : subsampling_x_[kPlaneU];
  const int8_t subsampling_y = y_plane ? 0 : subsampling_y_[kPlaneU];
  const int start_x = MultiplyBy4(column4x4) >> subsampling_x;
  const int start_y = MultiplyBy4(row4x4) >> subsampling_y;
  const int plane_width = SubsampledValue(frame_header_.width, subsampling_x);
  const int plane_height = SubsampledValue(frame_header_.height, subsampling_y);
  const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x;
  const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y;
  // unit_width, unit_height are the same as block_width, block_height unless
  // it reaches the frame boundary, where block_width < 64 or
  // block_height < 64. unit_width, unit_height guarantee we build blocks on
  // a multiple of 8.
  const int unit_width = Align(block_width, 8 >> subsampling_x);
  const int unit_height = Align(block_height, 8 >> subsampling_y);
  const bool is_frame_left = column4x4 == 0;
  const bool is_frame_right = start_x + block_width >= plane_width;
  const bool is_frame_top = row4x4 == 0;
  const bool is_frame_bottom = start_y + block_height >= plane_height;
  // Number of rows above the unit that are read from the source buffer.
  const int y_offset = is_frame_top ? 0 : kCdefBorder;
  // SetupCdefBorder() stores four rows per unit row starting at row
  // DivideBy4(row4x4). Stepping back two rows selects the last two rows saved
  // for the unit above.
  const int cdef_border_row_offset = DivideBy4(row4x4) - (is_frame_top ? 0 : 2);

  for (int plane = y_plane ? kPlaneY : kPlaneU; plane < max_planes; ++plane) {
    uint16_t* cdef_src = cdef_source + static_cast<int>(plane == kPlaneV) *
                                           kCdefUnitSizeWithBorders *
                                           kCdefUnitSizeWithBorders;
    const int src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
    const Pixel* src_buffer =
        reinterpret_cast<const Pixel*>(source_buffer_[plane]) +
        (start_y - y_offset) * src_stride + start_x;
    const int cdef_border_stride = cdef_border_.stride(plane) / sizeof(Pixel);
    // |cdef_border| is only used when multi-threaded filtering is on.
    const Pixel* cdef_border =
        (thread_pool_ == nullptr)
            ? nullptr
            : reinterpret_cast<const Pixel*>(cdef_border_.data(plane)) +
                  cdef_border_row_offset * cdef_border_stride + start_x;

    // All the copying code will use negative indices for populating the left
    // border. So the starting point is set to kCdefBorder.
    cdef_src += kCdefBorder;

    // Copy the top 2 rows as follows;
    // If is_frame_top is true, both the rows are set to kCdefLargeValue.
    // Otherwise:
    //   If multi-threaded filtering is off, the rows are copied from
    //   |src_buffer|.
    //   Otherwise, the rows are copied from |cdef_border|.
    if (is_frame_top) {
      for (int y = 0; y < kCdefBorder; ++y) {
        Memset(cdef_src - kCdefBorder, kCdefLargeValue,
               unit_width + 2 * kCdefBorder);
        cdef_src += cdef_stride;
      }
    } else {
      const Pixel* top_border =
          (thread_pool_ == nullptr) ? src_buffer : cdef_border;
      const int top_border_stride =
          (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
      for (int y = 0; y < kCdefBorder; ++y) {
        CopyRowForCdef(top_border, block_width, unit_width, is_frame_left,
                       is_frame_right, cdef_src);
        top_border += top_border_stride;
        cdef_src += cdef_stride;
        // We need to increment |src_buffer| and |cdef_border| in this loop to
        // set them up for the subsequent loops below.
        src_buffer += src_stride;
        cdef_border += cdef_border_stride;
      }
    }

    // Copy the body as follows;
    // If multi-threaded filtering is off or if is_frame_bottom is true, all the
    // rows are copied from |src_buffer|.
    // Otherwise, the first |block_height|-kCdefBorder rows are copied from
    // |src_buffer| and the last kCdefBorder rows are copied from |cdef_border|.
    int y = block_height;
    const int y_threshold =
        (thread_pool_ == nullptr || is_frame_bottom) ? 0 : kCdefBorder;
    const Pixel* left_border =
        (thread_pool_ == nullptr || !use_border_columns)
            ? nullptr
            : reinterpret_cast<const Pixel*>(border_columns[plane]);
    do {
      CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
                     is_frame_right, cdef_src, left_border);
      cdef_src += cdef_stride;
      src_buffer += src_stride;
      // The backed-up left border holds kCdefBorder pixels per row.
      if (left_border != nullptr) left_border += kCdefBorder;
    } while (--y != y_threshold);

    if (y > 0) {
      assert(y == kCdefBorder);
      // |cdef_border| now points to the top 2 rows of the current block. For
      // the next loop, we need it to point to the bottom 2 rows of the
      // current block. So increment it by 2 rows.
      cdef_border += MultiplyBy2(cdef_border_stride);
      for (int i = 0; i < kCdefBorder; ++i) {
        CopyRowForCdef(cdef_border, block_width, unit_width, is_frame_left,
                       is_frame_right, cdef_src);
        cdef_src += cdef_stride;
        cdef_border += cdef_border_stride;
      }
    }

    // Copy the bottom 2 rows as follows;
    // If is_frame_bottom is true, both the rows are set to kCdefLargeValue.
    // Otherwise:
    //   If multi-threaded filtering is off, the rows are copied from
    //   |src_buffer|.
    //   Otherwise, the rows are copied from |cdef_border|.
    // In both cases the loop also covers the |unit_height| - |block_height|
    // padding rows that precede the bottom border.
    y = 0;
    if (is_frame_bottom) {
      do {
        Memset(cdef_src - kCdefBorder, kCdefLargeValue,
               unit_width + 2 * kCdefBorder);
        cdef_src += cdef_stride;
      } while (++y < kCdefBorder + unit_height - block_height);
    } else {
      const Pixel* bottom_border =
          (thread_pool_ == nullptr) ? src_buffer : cdef_border;
      const int bottom_border_stride =
          (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
      do {
        CopyRowForCdef(bottom_border, block_width, unit_width, is_frame_left,
                       is_frame_right, cdef_src);
        bottom_border += bottom_border_stride;
        cdef_src += cdef_stride;
      } while (++y < kCdefBorder + unit_height - block_height);
    }
  }
}
276 
// Applies CDEF to one unit of |block_width4x4| x |block_height4x4| 4x4 blocks
// whose top-left 4x4 block is at (|row4x4_start|, |column4x4_start|). The
// output is written to the buffer returned by GetCdefBuffer().
//
// |cdef_block| is scratch space that PrepareCdefBlock() fills with the
// bordered 16-bit filter input, first for luma and then, reusing the same
// storage, for the two chroma planes.
// |index| selects the entry of the strength arrays in |frame_header_.cdef|.
// A value of -1 means the unit is not filtered at all.
// |border_columns| and |use_border_columns| carry the backed-up last
// kCdefBorder source columns of a unit from one call to the next (see the
// comment in the body). |use_border_columns[i][0]| refers to luma and
// |use_border_columns[i][1]| to chroma. |border_columns| is only dereferenced
// when |thread_pool_| is not nullptr.
//
// Luma is filtered first, one 8x8 block at a time. The direction of every
// block (or kCdefSkip) is recorded in |direction_y| and reused for the chroma
// planes. Whenever a block is left unfiltered its source pixels are copied to
// the cdef buffer, but only if |thread_pool_| is nullptr.
template <typename Pixel>
void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index,
                                     const int block_width4x4,
                                     const int block_height4x4,
                                     const int row4x4_start,
                                     const int column4x4_start,
                                     uint8_t border_columns[2][kMaxPlanes][256],
                                     bool use_border_columns[2][2]) {
  // Cdef operates in 8x8 blocks (4x4 for chroma with subsampling).
  static constexpr int kStep = 8;
  static constexpr int kStep4x4 = 2;

  // Per-plane pointers to the start of the current row of 8x8 blocks, and the
  // amount by which each pointer advances to reach the next row of blocks.
  // The uint8_t pointers advance in bytes, the uint16_t pointer in entries.
  int cdef_buffer_row_base_stride[kMaxPlanes];
  uint8_t* cdef_buffer_row_base[kMaxPlanes];
  int src_buffer_row_base_stride[kMaxPlanes];
  const uint8_t* src_buffer_row_base[kMaxPlanes];
  const uint16_t* cdef_src_row_base[kMaxPlanes];
  int cdef_src_row_base_stride[kMaxPlanes];
  // Width in bytes of one block in the frame buffers, per plane.
  int column_step[kMaxPlanes];
  assert(planes_ == kMaxPlanesMonochrome || planes_ == kMaxPlanes);
  int plane = kPlaneY;
  do {
    cdef_buffer_row_base[plane] =
        GetCdefBuffer(static_cast<Plane>(plane), row4x4_start, column4x4_start);
    cdef_buffer_row_base_stride[plane] =
        frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
    src_buffer_row_base[plane] = GetSourceBuffer(static_cast<Plane>(plane),
                                                 row4x4_start, column4x4_start);
    src_buffer_row_base_stride[plane] =
        frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
    // Skip the V plane offset (if any), the top border rows and the left
    // border so that the pointer addresses the first pixel of the unit.
    cdef_src_row_base[plane] =
        cdef_block +
        static_cast<int>(plane == kPlaneV) * kCdefUnitSizeWithBorders *
            kCdefUnitSizeWithBorders +
        kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
    cdef_src_row_base_stride[plane] =
        kCdefUnitSizeWithBorders * (kStep >> subsampling_y_[plane]);
    column_step[plane] = (kStep >> subsampling_x_[plane]) * sizeof(Pixel);
  } while (++plane < planes_);

  // |border_columns| contains two buffers. In each call to this function, we
  // will use one of them as the "destination" for the current call. And the
  // other one as the "source" for the current call (which would have been the
  // "destination" of the previous call). We will use the src_index to populate
  // the borders which were backed up in the previous call. We will use the
  // dst_index to populate the borders to be used in the next call.
  const int border_columns_src_index = DivideBy16(column4x4_start) & 1;
  const int border_columns_dst_index = border_columns_src_index ^ 1;

  // An index of -1 means that nothing in this unit is filtered. Copy the
  // source through (single-threaded case only) and tell the next call that no
  // border columns were backed up.
  if (index == -1) {
    if (thread_pool_ == nullptr) {
      int plane = kPlaneY;
      do {
        CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
                   cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
                   MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
                   MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
                   sizeof(Pixel));
      } while (++plane < planes_);
    }
    use_border_columns[border_columns_dst_index][0] = false;
    use_border_columns[border_columns_dst_index][1] = false;
    return;
  }

  const bool is_frame_right =
      MultiplyBy4(column4x4_start + block_width4x4) >= frame_header_.width;
  if (!is_frame_right && thread_pool_ != nullptr) {
    // Backup the last 2 columns for use in the next iteration.
    use_border_columns[border_columns_dst_index][0] = true;
    const uint8_t* src_line =
        GetSourceBuffer(kPlaneY, row4x4_start,
                        column4x4_start + block_width4x4) -
        kCdefBorder * sizeof(Pixel);
    assert(border_columns != nullptr);
    CopyPixels(src_line, frame_buffer_.stride(kPlaneY),
               border_columns[border_columns_dst_index][kPlaneY],
               kCdefBorder * sizeof(Pixel), kCdefBorder,
               MultiplyBy4(block_height4x4), sizeof(Pixel));
  }

  // Build the bordered luma input in |cdef_block|.
  PrepareCdefBlock<Pixel>(
      block_width4x4, block_height4x4, row4x4_start, column4x4_start,
      cdef_block, kCdefUnitSizeWithBorders, true,
      (border_columns != nullptr) ? border_columns[border_columns_src_index]
                                  : nullptr,
      use_border_columns[border_columns_src_index][0]);

  // Stored direction used during the u/v pass.  If bit 3 is set, then block is
  // a skip.
  uint8_t direction_y[8 * 8];
  int y_index = 0;

  const uint8_t y_primary_strength =
      frame_header_.cdef.y_primary_strength[index];
  const uint8_t y_secondary_strength =
      frame_header_.cdef.y_secondary_strength[index];
  // y_strength_index is 0 for both primary and secondary strengths being
  // non-zero, 1 for primary only, 2 for secondary only. This will be updated
  // with y_primary_strength after variance is applied.
  int y_strength_index = static_cast<int>(y_secondary_strength == 0);

  const bool compute_direction_and_variance =
      (y_primary_strength | frame_header_.cdef.uv_primary_strength[index]) != 0;
  // |cdef_skip_| holds one byte per row of 8x8 blocks of a unit. A cleared
  // bit means that the corresponding 8x8 block is not filtered (see
  // |skip_shift| below).
  const uint8_t* skip_row =
      &cdef_skip_[row4x4_start >> 1][column4x4_start >> 4];
  const int skip_stride = cdef_skip_.columns();
  int row4x4 = row4x4_start;
  do {
    uint8_t* cdef_buffer_base = cdef_buffer_row_base[kPlaneY];
    const uint8_t* src_buffer_base = src_buffer_row_base[kPlaneY];
    const uint16_t* cdef_src_base = cdef_src_row_base[kPlaneY];
    int column4x4 = column4x4_start;

    if (*skip_row == 0) {
      // No block in this row of 8x8 blocks is filtered.
      for (int i = 0; i < DivideBy2(block_width4x4); ++i, ++y_index) {
        direction_y[y_index] = kCdefSkip;
      }
      if (thread_pool_ == nullptr) {
        // NOTE(review): this copies 64 pixels per row even when the unit is
        // narrower than 64 pixels. Presumably the buffers are padded enough
        // for that. TODO confirm.
        CopyPixels(src_buffer_base, frame_buffer_.stride(kPlaneY),
                   cdef_buffer_base, frame_buffer_.stride(kPlaneY), 64, kStep,
                   sizeof(Pixel));
      }
    } else {
      do {
        const int block_width = kStep;
        const int block_height = kStep;
        const int cdef_stride = frame_buffer_.stride(kPlaneY);
        uint8_t* const cdef_buffer = cdef_buffer_base;
        const uint16_t* const cdef_src = cdef_src_base;
        const int src_stride = frame_buffer_.stride(kPlaneY);
        const uint8_t* const src_buffer = src_buffer_base;

        // Bit position of this 8x8 block within the skip byte.
        const uint8_t skip_shift = (column4x4 >> 1) & 0x7;
        const bool skip = ((*skip_row >> skip_shift) & 1) == 0;
        if (skip) {  // No cdef filtering.
          direction_y[y_index] = kCdefSkip;
          if (thread_pool_ == nullptr) {
            CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
                       block_width, block_height, sizeof(Pixel));
          }
        } else {
          // Zero out residual skip flag.
          direction_y[y_index] = 0;

          int variance = 0;
          if (compute_direction_and_variance) {
            if (thread_pool_ == nullptr ||
                row4x4 + kStep4x4 < row4x4_start + block_height4x4) {
              dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index],
                                  &variance);
            } else if (sizeof(Pixel) == 2) {
              // |cdef_src| already has 16-bit pixels, so it can be used
              // directly. The stride is multiplied by 2, presumably because
              // it is expected in bytes.
              dsp_.cdef_direction(cdef_src, kCdefUnitSizeWithBorders * 2,
                                  &direction_y[y_index], &variance);
            } else {
              // If we are in the last row4x4 for this unit, then the last two
              // input rows have to come from |cdef_border_|. Since we already
              // have |cdef_src| populated correctly, use that as the input
              // for the direction process.
              uint8_t direction_src[8][8];
              const uint16_t* cdef_src_line = cdef_src;
              for (auto& direction_src_line : direction_src) {
                for (int i = 0; i < 8; ++i) {
                  direction_src_line[i] = cdef_src_line[i];
                }
                cdef_src_line += kCdefUnitSizeWithBorders;
              }
              dsp_.cdef_direction(direction_src, 8, &direction_y[y_index],
                                  &variance);
            }
          }
          const int direction =
              (y_primary_strength == 0) ? 0 : direction_y[y_index];
          // Scale the luma primary strength by the block variance. A variance
          // of zero disables the primary filter for this block.
          const int variance_strength =
              ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12)
                                     : 0;
          const uint8_t primary_strength =
              (variance != 0)
                  ? (y_primary_strength * (4 + variance_strength) + 8) >> 4
                  : 0;
          if ((primary_strength | y_secondary_strength) == 0) {
            // Both strengths are zero, so the block is left unfiltered.
            if (thread_pool_ == nullptr) {
              CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
                         block_width, block_height, sizeof(Pixel));
            }
          } else {
            const int strength_index =
                y_strength_index |
                (static_cast<int>(primary_strength == 0) << 1);
            dsp_.cdef_filters[1][strength_index](
                cdef_src, kCdefUnitSizeWithBorders, block_height,
                primary_strength, y_secondary_strength,
                frame_header_.cdef.damping, direction, cdef_buffer,
                cdef_stride);
          }
        }
        // |column_step| is in bytes. |cdef_src_base| points to 16-bit entries
        // and advances by the number of pixels instead.
        cdef_buffer_base += column_step[kPlaneY];
        src_buffer_base += column_step[kPlaneY];
        cdef_src_base += column_step[kPlaneY] / sizeof(Pixel);

        column4x4 += kStep4x4;
        y_index++;
      } while (column4x4 < column4x4_start + block_width4x4);
    }

    cdef_buffer_row_base[kPlaneY] += cdef_buffer_row_base_stride[kPlaneY];
    src_buffer_row_base[kPlaneY] += src_buffer_row_base_stride[kPlaneY];
    cdef_src_row_base[kPlaneY] += cdef_src_row_base_stride[kPlaneY];
    skip_row += skip_stride;
    row4x4 += kStep4x4;
  } while (row4x4 < row4x4_start + block_height4x4);

  if (planes_ == kMaxPlanesMonochrome) {
    return;
  }

  const uint8_t uv_primary_strength =
      frame_header_.cdef.uv_primary_strength[index];
  const uint8_t uv_secondary_strength =
      frame_header_.cdef.uv_secondary_strength[index];

  // With both chroma strengths at zero, no chroma block is filtered.
  if ((uv_primary_strength | uv_secondary_strength) == 0) {
    if (thread_pool_ == nullptr) {
      for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
        CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
                   cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
                   MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
                   MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
                   sizeof(Pixel));
      }
    }
    use_border_columns[border_columns_dst_index][1] = false;
    return;
  }

  if (!is_frame_right && thread_pool_ != nullptr) {
    use_border_columns[border_columns_dst_index][1] = true;
    for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
      // Backup the last 2 columns for use in the next iteration.
      const uint8_t* src_line =
          GetSourceBuffer(static_cast<Plane>(plane), row4x4_start,
                          column4x4_start + block_width4x4) -
          kCdefBorder * sizeof(Pixel);
      CopyPixels(src_line, frame_buffer_.stride(plane),
                 border_columns[border_columns_dst_index][plane],
                 kCdefBorder * sizeof(Pixel), kCdefBorder,
                 MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
                 sizeof(Pixel));
    }
  }

  // Build the bordered chroma input in |cdef_block|. This replaces the luma
  // input prepared above.
  PrepareCdefBlock<Pixel>(
      block_width4x4, block_height4x4, row4x4_start, column4x4_start,
      cdef_block, kCdefUnitSizeWithBorders, false,
      (border_columns != nullptr) ? border_columns[border_columns_src_index]
                                  : nullptr,
      use_border_columns[border_columns_src_index][1]);

  // uv_strength_index is 0 for both primary and secondary strengths being
  // non-zero, 1 for primary only, 2 for secondary only.
  const int uv_strength_index =
      (static_cast<int>(uv_primary_strength == 0) << 1) |
      static_cast<int>(uv_secondary_strength == 0);
  for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
    const int8_t subsampling_x = subsampling_x_[plane];
    const int8_t subsampling_y = subsampling_y_[plane];
    const int block_width = kStep >> subsampling_x;
    const int block_height = kStep >> subsampling_y;
    int row4x4 = row4x4_start;

    // Walk |direction_y| in the same order in which the luma pass filled it.
    y_index = 0;
    do {
      uint8_t* cdef_buffer_base = cdef_buffer_row_base[plane];
      const uint8_t* src_buffer_base = src_buffer_row_base[plane];
      const uint16_t* cdef_src_base = cdef_src_row_base[plane];
      int column4x4 = column4x4_start;
      do {
        const int cdef_stride = frame_buffer_.stride(plane);
        uint8_t* const cdef_buffer = cdef_buffer_base;
        const int src_stride = frame_buffer_.stride(plane);
        const uint8_t* const src_buffer = src_buffer_base;
        const uint16_t* const cdef_src = cdef_src_base;
        const bool skip = (direction_y[y_index] & kCdefSkip) != 0;
        // 1 if this block and the next one are filtered with a single call.
        int dual_cdef = 0;

        if (skip) {  // No cdef filtering.
          if (thread_pool_ == nullptr) {
            CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
                       block_width, block_height, sizeof(Pixel));
          }
        } else {
          // Make sure block pair is not out of bounds.
          if (column4x4 + (kStep4x4 * 2) <= column4x4_start + block_width4x4) {
            // Enable dual processing if subsampling_x is 1.
            dual_cdef = subsampling_x;
          }

          int direction = (uv_primary_strength == 0)
                              ? 0
                              : kCdefUvDirection[subsampling_x][subsampling_y]
                                                [direction_y[y_index]];

          if (dual_cdef != 0) {
            if (uv_primary_strength &&
                direction_y[y_index] != direction_y[y_index + 1]) {
              // Disable dual processing if the second block of the pair does
              // not have the same direction.
              dual_cdef = 0;
            }

            // Disable dual processing if the second block of the pair is a
            // skip.
            if (direction_y[y_index + 1] == kCdefSkip) {
              dual_cdef = 0;
            }
          }

          // Block width is 8 if either dual_cdef is true or subsampling_x == 0.
          const int width_index = dual_cdef | (subsampling_x ^ 1);
          // Chroma is filtered with a damping one less than the luma damping.
          dsp_.cdef_filters[width_index][uv_strength_index](
              cdef_src, kCdefUnitSizeWithBorders, block_height,
              uv_primary_strength, uv_secondary_strength,
              frame_header_.cdef.damping - 1, direction, cdef_buffer,
              cdef_stride);
        }
        // When dual_cdef is set, the above cdef_filter() will process 2 blocks,
        // so adjust the pointers and indexes for 2 blocks.
        cdef_buffer_base += column_step[plane] << dual_cdef;
        src_buffer_base += column_step[plane] << dual_cdef;
        cdef_src_base += (column_step[plane] / sizeof(Pixel)) << dual_cdef;
        column4x4 += kStep4x4 << dual_cdef;
        y_index += 1 << dual_cdef;
      } while (column4x4 < column4x4_start + block_width4x4);

      cdef_buffer_row_base[plane] += cdef_buffer_row_base_stride[plane];
      src_buffer_row_base[plane] += src_buffer_row_base_stride[plane];
      cdef_src_row_base[plane] += cdef_src_row_base_stride[plane];
      row4x4 += kStep4x4;
    } while (row4x4 < row4x4_start + block_height4x4);
  }
}
618 
ApplyCdefForOneSuperBlockRowHelper(uint16_t * cdef_block,uint8_t border_columns[2][kMaxPlanes][256],int row4x4,int block_height4x4)619 void PostFilter::ApplyCdefForOneSuperBlockRowHelper(
620     uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
621     int row4x4, int block_height4x4) {
622   bool use_border_columns[2][2] = {};
623   const bool non_zero_index = frame_header_.cdef.bits > 0;
624   const int8_t* cdef_index =
625       non_zero_index ? cdef_index_[DivideBy16(row4x4)] : nullptr;
626   int column4x4 = 0;
627   do {
628     const int index = non_zero_index ? *cdef_index++ : 0;
629     const int block_width4x4 =
630         std::min(kStep64x64, frame_header_.columns4x4 - column4x4);
631 
632 #if LIBGAV1_MAX_BITDEPTH >= 10
633     if (bitdepth_ >= 10) {
634       ApplyCdefForOneUnit<uint16_t>(cdef_block, index, block_width4x4,
635                                     block_height4x4, row4x4, column4x4,
636                                     border_columns, use_border_columns);
637     } else  // NOLINT
638 #endif      // LIBGAV1_MAX_BITDEPTH >= 10
639     {
640       ApplyCdefForOneUnit<uint8_t>(cdef_block, index, block_width4x4,
641                                    block_height4x4, row4x4, column4x4,
642                                    border_columns, use_border_columns);
643     }
644     column4x4 += kStep64x64;
645   } while (column4x4 < frame_header_.columns4x4);
646 }
647 
ApplyCdefForOneSuperBlockRow(int row4x4_start,int sb4x4,bool is_last_row)648 void PostFilter::ApplyCdefForOneSuperBlockRow(int row4x4_start, int sb4x4,
649                                               bool is_last_row) {
650   assert(row4x4_start >= 0);
651   assert(DoCdef());
652   int row4x4 = row4x4_start;
653   const int row4x4_limit = row4x4_start + sb4x4;
654   do {
655     if (row4x4 >= frame_header_.rows4x4) return;
656 
657     // Apply cdef for the last 8 rows of the previous superblock row.
658     // One exception: If the superblock size is 128x128 and is_last_row is true,
659     // then we simply apply cdef for the entire superblock row without any lag.
660     // In that case, apply cdef for the previous superblock row only during the
661     // first iteration (row4x4 == row4x4_start).
662     if (row4x4 > 0 && (!is_last_row || row4x4 == row4x4_start)) {
663       assert(row4x4 >= 16);
664       ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4 - 2, 2);
665     }
666 
667     // Apply cdef for the current superblock row. If this is the last superblock
668     // row we apply cdef for all the rows, otherwise we leave out the last 8
669     // rows.
670     const int block_height4x4 =
671         std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
672     const int height4x4 = block_height4x4 - (is_last_row ? 0 : 2);
673     if (height4x4 > 0) {
674       ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4,
675                                          height4x4);
676     }
677     row4x4 += kStep64x64;
678   } while (row4x4 < row4x4_limit);
679 }
680 
ApplyCdefWorker(std::atomic<int> * row4x4_atomic)681 void PostFilter::ApplyCdefWorker(std::atomic<int>* row4x4_atomic) {
682   int row4x4;
683   uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
684   // Each border_column buffer has to store 64 rows and 2 columns for each
685   // plane. For 10bit, that is 64*2*2 = 256 bytes.
686   alignas(kMaxAlignment) uint8_t border_columns[2][kMaxPlanes][256];
687   while ((row4x4 = row4x4_atomic->fetch_add(
688               kStep64x64, std::memory_order_relaxed)) < frame_header_.rows4x4) {
689     const int block_height4x4 =
690         std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
691     ApplyCdefForOneSuperBlockRowHelper(cdef_block, border_columns, row4x4,
692                                        block_height4x4);
693   }
694 }
695 
696 }  // namespace libgav1
697