1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 #include "src/post_filter.h"
15 #include "src/utils/blocking_counter.h"
16 #include "src/utils/compiler_attributes.h"
17 #include "src/utils/constants.h"
18
19 namespace libgav1 {
20 namespace {
21
// Width and height of a 64x64 cdef unit, measured in 4x4 blocks.
constexpr int kStep64x64 = 64 / 4;
// Marker stored in a per-8x8 luma direction slot when the block is skipped.
// It is bit 3, so it can be tested with a mask during the chroma pass.
constexpr int kCdefSkip = 1 << 3;

// Chroma filter direction for a given luma direction. Indexed as
// [subsampling_x][subsampling_y][luma direction].
constexpr uint8_t kCdefUvDirection[2][2][8] = {
    {
        {0, 1, 2, 3, 4, 5, 6, 7},  // subsampling_x = 0, subsampling_y = 0
        {1, 2, 2, 2, 3, 4, 6, 0},  // subsampling_x = 0, subsampling_y = 1
    },
    {
        {7, 0, 2, 4, 5, 6, 6, 6},  // subsampling_x = 1, subsampling_y = 0
        {0, 1, 2, 3, 4, 5, 6, 7},  // subsampling_x = 1, subsampling_y = 1
    },
};

// Rows of a unit, relative to the unit's first row in the plane, that are
// saved into the cdef border buffer. Indexed as [subsampling_y][i].
constexpr int kCdefBorderRows[2][4] = {
    {0, 1, 62, 63},
    {0, 1, 30, 31},
};
30
31 template <typename Pixel>
CopyRowForCdef(const Pixel * src,int block_width,int unit_width,bool is_frame_left,bool is_frame_right,uint16_t * const dst,const Pixel * left_border=nullptr)32 void CopyRowForCdef(const Pixel* src, int block_width, int unit_width,
33 bool is_frame_left, bool is_frame_right,
34 uint16_t* const dst, const Pixel* left_border = nullptr) {
35 if (sizeof(src[0]) == sizeof(dst[0])) {
36 if (is_frame_left) {
37 Memset(dst - kCdefBorder, kCdefLargeValue, kCdefBorder);
38 } else if (left_border == nullptr) {
39 memcpy(dst - kCdefBorder, src - kCdefBorder,
40 kCdefBorder * sizeof(dst[0]));
41 } else {
42 memcpy(dst - kCdefBorder, left_border, kCdefBorder * sizeof(dst[0]));
43 }
44 memcpy(dst, src, block_width * sizeof(dst[0]));
45 if (is_frame_right) {
46 Memset(dst + block_width, kCdefLargeValue,
47 unit_width + kCdefBorder - block_width);
48 } else {
49 memcpy(dst + block_width, src + block_width,
50 (unit_width + kCdefBorder - block_width) * sizeof(dst[0]));
51 }
52 return;
53 }
54 if (is_frame_left) {
55 for (int x = -kCdefBorder; x < 0; ++x) {
56 dst[x] = static_cast<uint16_t>(kCdefLargeValue);
57 }
58 } else if (left_border == nullptr) {
59 for (int x = -kCdefBorder; x < 0; ++x) {
60 dst[x] = src[x];
61 }
62 } else {
63 for (int x = -kCdefBorder; x < 0; ++x) {
64 dst[x] = left_border[x + kCdefBorder];
65 }
66 }
67 for (int x = 0; x < block_width; ++x) {
68 dst[x] = src[x];
69 }
70 for (int x = block_width; x < unit_width + kCdefBorder; ++x) {
71 dst[x] = is_frame_right ? static_cast<uint16_t>(kCdefLargeValue) : src[x];
72 }
73 }
74
// Copies a |width| x |height| rectangle of pixels, each |pixel_size| bytes
// wide, from |src| to |dst|. |src_stride| and |dst_stride| are the distances,
// in bytes, between the starts of consecutive rows.
//
// Nothing is copied when |width| or |height| is not positive. Rows are copied
// with memcpy, so the source and destination rows must not overlap.
void CopyPixels(const uint8_t* src, int src_stride, uint8_t* dst,
                int dst_stride, int width, int height, size_t pixel_size) {
  // A non-positive size means an empty rectangle. Without this check a zero
  // |height| would still copy rows, and a negative |width| would turn into a
  // huge unsigned byte count.
  if (width <= 0 || height <= 0) return;
  const size_t row_bytes = static_cast<size_t>(width) * pixel_size;
  for (int y = 0; y < height; ++y) {
    memcpy(dst, src, row_bytes);
    src += src_stride;
    dst += dst_stride;
  }
}
86
87 } // namespace
88
SetupCdefBorder(int row4x4)89 void PostFilter::SetupCdefBorder(int row4x4) {
90 assert(row4x4 >= 0);
91 assert(DoCdef());
92 int plane = kPlaneY;
93 do {
94 const ptrdiff_t src_stride = frame_buffer_.stride(plane);
95 const ptrdiff_t dst_stride = cdef_border_.stride(plane);
96 const int row_offset = DivideBy4(row4x4);
97 const int num_pixels = SubsampledValue(
98 MultiplyBy4(frame_header_.columns4x4), subsampling_x_[plane]);
99 const int row_width = num_pixels << pixel_size_log2_;
100 const int plane_height = SubsampledValue(MultiplyBy4(frame_header_.rows4x4),
101 subsampling_y_[plane]);
102 for (int i = 0; i < 4; ++i) {
103 const int row = kCdefBorderRows[subsampling_y_[plane]][i];
104 const int absolute_row =
105 (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
106 if (absolute_row >= plane_height) break;
107 const uint8_t* src =
108 GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
109 row * src_stride;
110 uint8_t* dst = cdef_border_.data(plane) + dst_stride * (row_offset + i);
111 memcpy(dst, src, row_width);
112 }
113 } while (++plane < planes_);
114 }
115
116 template <typename Pixel>
PrepareCdefBlock(int block_width4x4,int block_height4x4,int row4x4,int column4x4,uint16_t * cdef_source,ptrdiff_t cdef_stride,const bool y_plane,const uint8_t border_columns[kMaxPlanes][256],bool use_border_columns)117 void PostFilter::PrepareCdefBlock(int block_width4x4, int block_height4x4,
118 int row4x4, int column4x4,
119 uint16_t* cdef_source, ptrdiff_t cdef_stride,
120 const bool y_plane,
121 const uint8_t border_columns[kMaxPlanes][256],
122 bool use_border_columns) {
123 assert(y_plane || planes_ == kMaxPlanes);
124 const int max_planes = y_plane ? 1 : kMaxPlanes;
125 const int8_t subsampling_x = y_plane ? 0 : subsampling_x_[kPlaneU];
126 const int8_t subsampling_y = y_plane ? 0 : subsampling_y_[kPlaneU];
127 const int start_x = MultiplyBy4(column4x4) >> subsampling_x;
128 const int start_y = MultiplyBy4(row4x4) >> subsampling_y;
129 const int plane_width = SubsampledValue(width_, subsampling_x);
130 const int plane_height = SubsampledValue(height_, subsampling_y);
131 const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x;
132 const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y;
133 // unit_width, unit_height are the same as block_width, block_height unless
134 // it reaches the frame boundary, where block_width < 64 or
135 // block_height < 64. unit_width, unit_height guarantee we build blocks on
136 // a multiple of 8.
137 const int unit_width = Align(block_width, 8 >> subsampling_x);
138 const int unit_height = Align(block_height, 8 >> subsampling_y);
139 const bool is_frame_left = column4x4 == 0;
140 const bool is_frame_right = start_x + block_width >= plane_width;
141 const bool is_frame_top = row4x4 == 0;
142 const bool is_frame_bottom = start_y + block_height >= plane_height;
143 const int y_offset = is_frame_top ? 0 : kCdefBorder;
144 const int cdef_border_row_offset = DivideBy4(row4x4) - (is_frame_top ? 0 : 2);
145
146 for (int plane = y_plane ? kPlaneY : kPlaneU; plane < max_planes; ++plane) {
147 uint16_t* cdef_src = cdef_source + static_cast<int>(plane == kPlaneV) *
148 kCdefUnitSizeWithBorders *
149 kCdefUnitSizeWithBorders;
150 const int src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
151 const Pixel* src_buffer =
152 reinterpret_cast<const Pixel*>(source_buffer_[plane]) +
153 (start_y - y_offset) * src_stride + start_x;
154 const int cdef_border_stride = cdef_border_.stride(plane) / sizeof(Pixel);
155 const Pixel* cdef_border =
156 (thread_pool_ == nullptr)
157 ? nullptr
158 : reinterpret_cast<const Pixel*>(cdef_border_.data(plane)) +
159 cdef_border_row_offset * cdef_border_stride + start_x;
160
161 // All the copying code will use negative indices for populating the left
162 // border. So the starting point is set to kCdefBorder.
163 cdef_src += kCdefBorder;
164
165 // Copy the top 2 rows as follows;
166 // If is_frame_top is true, both the rows are set to kCdefLargeValue.
167 // Otherwise:
168 // If multi-threaded filtering is off, the rows are copied from
169 // |src_buffer|.
170 // Otherwise, the rows are copied from |cdef_border|.
171 if (is_frame_top) {
172 for (int y = 0; y < kCdefBorder; ++y) {
173 Memset(cdef_src - kCdefBorder, kCdefLargeValue,
174 unit_width + 2 * kCdefBorder);
175 cdef_src += cdef_stride;
176 }
177 } else {
178 const Pixel* top_border =
179 (thread_pool_ == nullptr) ? src_buffer : cdef_border;
180 const int top_border_stride =
181 (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
182 for (int y = 0; y < kCdefBorder; ++y) {
183 CopyRowForCdef(top_border, block_width, unit_width, is_frame_left,
184 is_frame_right, cdef_src);
185 top_border += top_border_stride;
186 cdef_src += cdef_stride;
187 // We need to increment |src_buffer| and |cdef_border| in this loop to
188 // set them up for the subsequent loops below.
189 src_buffer += src_stride;
190 cdef_border += cdef_border_stride;
191 }
192 }
193
194 // Copy the body as follows;
195 // If multi-threaded filtering is off or if is_frame_bottom is true, all the
196 // rows are copied from |src_buffer|.
197 // Otherwise, the first |block_height|-kCdefBorder rows are copied from
198 // |src_buffer| and the last kCdefBorder rows are coped from |cdef_border|.
199 int y = block_height;
200 const int y_threshold =
201 (thread_pool_ == nullptr || is_frame_bottom) ? 0 : kCdefBorder;
202 const Pixel* left_border =
203 (thread_pool_ == nullptr || !use_border_columns)
204 ? nullptr
205 : reinterpret_cast<const Pixel*>(border_columns[plane]);
206 do {
207 CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
208 is_frame_right, cdef_src, left_border);
209 cdef_src += cdef_stride;
210 src_buffer += src_stride;
211 if (left_border != nullptr) left_border += kCdefBorder;
212 } while (--y != y_threshold);
213
214 if (y > 0) {
215 assert(y == kCdefBorder);
216 // |cdef_border| now points to the top 2 rows of the current block. For
217 // the next loop, we need it to point to the bottom 2 rows of the
218 // current block. So increment it by 2 rows.
219 cdef_border += MultiplyBy2(cdef_border_stride);
220 for (int i = 0; i < kCdefBorder; ++i) {
221 CopyRowForCdef(cdef_border, block_width, unit_width, is_frame_left,
222 is_frame_right, cdef_src);
223 cdef_src += cdef_stride;
224 cdef_border += cdef_border_stride;
225 }
226 }
227
228 // Copy the bottom 2 rows as follows;
229 // If is_frame_bottom is true, both the rows are set to kCdefLargeValue.
230 // Otherwise:
231 // If multi-threaded filtering is off, the rows are copied from
232 // |src_buffer|.
233 // Otherwise, the rows are copied from |cdef_border|.
234 y = 0;
235 if (is_frame_bottom) {
236 do {
237 Memset(cdef_src - kCdefBorder, kCdefLargeValue,
238 unit_width + 2 * kCdefBorder);
239 cdef_src += cdef_stride;
240 } while (++y < kCdefBorder + unit_height - block_height);
241 } else {
242 const Pixel* bottom_border =
243 (thread_pool_ == nullptr) ? src_buffer : cdef_border;
244 const int bottom_border_stride =
245 (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
246 do {
247 CopyRowForCdef(bottom_border, block_width, unit_width, is_frame_left,
248 is_frame_right, cdef_src);
249 bottom_border += bottom_border_stride;
250 cdef_src += cdef_stride;
251 } while (++y < kCdefBorder + unit_height - block_height);
252 }
253 }
254 }
255
256 template <typename Pixel>
ApplyCdefForOneUnit(uint16_t * cdef_block,const int index,const int block_width4x4,const int block_height4x4,const int row4x4_start,const int column4x4_start,uint8_t border_columns[2][kMaxPlanes][256],bool use_border_columns[2][2])257 void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index,
258 const int block_width4x4,
259 const int block_height4x4,
260 const int row4x4_start,
261 const int column4x4_start,
262 uint8_t border_columns[2][kMaxPlanes][256],
263 bool use_border_columns[2][2]) {
264 // Cdef operates in 8x8 blocks (4x4 for chroma with subsampling).
265 static constexpr int kStep = 8;
266 static constexpr int kStep4x4 = 2;
267
268 int cdef_buffer_row_base_stride[kMaxPlanes];
269 uint8_t* cdef_buffer_row_base[kMaxPlanes];
270 int src_buffer_row_base_stride[kMaxPlanes];
271 const uint8_t* src_buffer_row_base[kMaxPlanes];
272 const uint16_t* cdef_src_row_base[kMaxPlanes];
273 int cdef_src_row_base_stride[kMaxPlanes];
274 int column_step[kMaxPlanes];
275 assert(planes_ == kMaxPlanesMonochrome || planes_ == kMaxPlanes);
276 int plane = kPlaneY;
277 do {
278 cdef_buffer_row_base[plane] =
279 GetCdefBuffer(static_cast<Plane>(plane), row4x4_start, column4x4_start);
280 cdef_buffer_row_base_stride[plane] =
281 frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
282 src_buffer_row_base[plane] = GetSourceBuffer(static_cast<Plane>(plane),
283 row4x4_start, column4x4_start);
284 src_buffer_row_base_stride[plane] =
285 frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
286 cdef_src_row_base[plane] =
287 cdef_block +
288 static_cast<int>(plane == kPlaneV) * kCdefUnitSizeWithBorders *
289 kCdefUnitSizeWithBorders +
290 kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
291 cdef_src_row_base_stride[plane] =
292 kCdefUnitSizeWithBorders * (kStep >> subsampling_y_[plane]);
293 column_step[plane] = (kStep >> subsampling_x_[plane]) * sizeof(Pixel);
294 } while (++plane < planes_);
295
296 // |border_columns| contains two buffers. In each call to this function, we
297 // will use one of them as the "destination" for the current call. And the
298 // other one as the "source" for the current call (which would have been the
299 // "destination" of the previous call). We will use the src_index to populate
300 // the borders which were backed up in the previous call. We will use the
301 // dst_index to populate the borders to be used in the next call.
302 const int border_columns_src_index = DivideBy16(column4x4_start) & 1;
303 const int border_columns_dst_index = border_columns_src_index ^ 1;
304
305 if (index == -1) {
306 if (thread_pool_ == nullptr) {
307 int plane = kPlaneY;
308 do {
309 CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
310 cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
311 MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
312 MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
313 sizeof(Pixel));
314 } while (++plane < planes_);
315 }
316 use_border_columns[border_columns_dst_index][0] = false;
317 use_border_columns[border_columns_dst_index][1] = false;
318 return;
319 }
320
321 const bool is_frame_right =
322 MultiplyBy4(column4x4_start) + MultiplyBy4(block_width4x4) >= width_;
323 if (!is_frame_right && thread_pool_ != nullptr) {
324 // Backup the last 2 columns for use in the next iteration.
325 use_border_columns[border_columns_dst_index][0] = true;
326 const uint8_t* src_line =
327 GetSourceBuffer(kPlaneY, row4x4_start,
328 column4x4_start + block_width4x4) -
329 kCdefBorder * sizeof(Pixel);
330 CopyPixels(src_line, frame_buffer_.stride(kPlaneY),
331 border_columns[border_columns_dst_index][kPlaneY],
332 kCdefBorder * sizeof(Pixel), kCdefBorder,
333 MultiplyBy4(block_height4x4), sizeof(Pixel));
334 }
335
336 PrepareCdefBlock<Pixel>(
337 block_width4x4, block_height4x4, row4x4_start, column4x4_start,
338 cdef_block, kCdefUnitSizeWithBorders, true,
339 (border_columns != nullptr) ? border_columns[border_columns_src_index]
340 : nullptr,
341 use_border_columns[border_columns_src_index][0]);
342
343 // Stored direction used during the u/v pass. If bit 3 is set, then block is
344 // a skip.
345 uint8_t direction_y[8 * 8];
346 int y_index = 0;
347
348 const uint8_t y_primary_strength =
349 frame_header_.cdef.y_primary_strength[index];
350 const uint8_t y_secondary_strength =
351 frame_header_.cdef.y_secondary_strength[index];
352 // y_strength_index is 0 for both primary and secondary strengths being
353 // non-zero, 1 for primary only, 2 for secondary only. This will be updated
354 // with y_primary_strength after variance is applied.
355 int y_strength_index = static_cast<int>(y_secondary_strength == 0);
356
357 const bool compute_direction_and_variance =
358 (y_primary_strength | frame_header_.cdef.uv_primary_strength[index]) != 0;
359 BlockParameters* const* bp_row0_base =
360 block_parameters_.Address(row4x4_start, column4x4_start);
361 BlockParameters* const* bp_row1_base =
362 bp_row0_base + block_parameters_.columns4x4();
363 const int bp_stride = MultiplyBy2(block_parameters_.columns4x4());
364 int row4x4 = row4x4_start;
365 do {
366 uint8_t* cdef_buffer_base = cdef_buffer_row_base[kPlaneY];
367 const uint8_t* src_buffer_base = src_buffer_row_base[kPlaneY];
368 const uint16_t* cdef_src_base = cdef_src_row_base[kPlaneY];
369 BlockParameters* const* bp0 = bp_row0_base;
370 BlockParameters* const* bp1 = bp_row1_base;
371 int column4x4 = column4x4_start;
372 do {
373 const int block_width = kStep;
374 const int block_height = kStep;
375 const int cdef_stride = frame_buffer_.stride(kPlaneY);
376 uint8_t* const cdef_buffer = cdef_buffer_base;
377 const uint16_t* const cdef_src = cdef_src_base;
378 const int src_stride = frame_buffer_.stride(kPlaneY);
379 const uint8_t* const src_buffer = src_buffer_base;
380
381 const bool skip = (*bp0)->skip && (*(bp0 + 1))->skip && (*bp1)->skip &&
382 (*(bp1 + 1))->skip;
383
384 if (skip) { // No cdef filtering.
385 direction_y[y_index] = kCdefSkip;
386 if (thread_pool_ == nullptr) {
387 CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
388 block_width, block_height, sizeof(Pixel));
389 }
390 } else {
391 // Zero out residual skip flag.
392 direction_y[y_index] = 0;
393
394 int variance = 0;
395 if (compute_direction_and_variance) {
396 if (thread_pool_ == nullptr ||
397 row4x4 + kStep4x4 < row4x4_start + block_height4x4) {
398 dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index],
399 &variance);
400 } else if (sizeof(Pixel) == 2) {
401 dsp_.cdef_direction(cdef_src, kCdefUnitSizeWithBorders * 2,
402 &direction_y[y_index], &variance);
403 } else {
404 // If we are in the last row4x4 for this unit, then the last two
405 // input rows have to come from |cdef_border_|. Since we already
406 // have |cdef_src| populated correctly, use that as the input
407 // for the direction process.
408 uint8_t direction_src[8][8];
409 const uint16_t* cdef_src_line = cdef_src;
410 for (auto& direction_src_line : direction_src) {
411 for (int i = 0; i < 8; ++i) {
412 direction_src_line[i] = cdef_src_line[i];
413 }
414 cdef_src_line += kCdefUnitSizeWithBorders;
415 }
416 dsp_.cdef_direction(direction_src, 8, &direction_y[y_index],
417 &variance);
418 }
419 }
420 const int direction =
421 (y_primary_strength == 0) ? 0 : direction_y[y_index];
422 const int variance_strength =
423 ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12) : 0;
424 const uint8_t primary_strength =
425 (variance != 0)
426 ? (y_primary_strength * (4 + variance_strength) + 8) >> 4
427 : 0;
428 if ((primary_strength | y_secondary_strength) == 0) {
429 if (thread_pool_ == nullptr) {
430 CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
431 block_width, block_height, sizeof(Pixel));
432 }
433 } else {
434 const int strength_index =
435 y_strength_index | (static_cast<int>(primary_strength == 0) << 1);
436 dsp_.cdef_filters[1][strength_index](
437 cdef_src, kCdefUnitSizeWithBorders, block_height,
438 primary_strength, y_secondary_strength,
439 frame_header_.cdef.damping, direction, cdef_buffer, cdef_stride);
440 }
441 }
442 cdef_buffer_base += column_step[kPlaneY];
443 src_buffer_base += column_step[kPlaneY];
444 cdef_src_base += column_step[kPlaneY] / sizeof(Pixel);
445
446 bp0 += kStep4x4;
447 bp1 += kStep4x4;
448 column4x4 += kStep4x4;
449 y_index++;
450 } while (column4x4 < column4x4_start + block_width4x4);
451
452 cdef_buffer_row_base[kPlaneY] += cdef_buffer_row_base_stride[kPlaneY];
453 src_buffer_row_base[kPlaneY] += src_buffer_row_base_stride[kPlaneY];
454 cdef_src_row_base[kPlaneY] += cdef_src_row_base_stride[kPlaneY];
455 bp_row0_base += bp_stride;
456 bp_row1_base += bp_stride;
457 row4x4 += kStep4x4;
458 } while (row4x4 < row4x4_start + block_height4x4);
459
460 if (planes_ == kMaxPlanesMonochrome) {
461 return;
462 }
463
464 const uint8_t uv_primary_strength =
465 frame_header_.cdef.uv_primary_strength[index];
466 const uint8_t uv_secondary_strength =
467 frame_header_.cdef.uv_secondary_strength[index];
468
469 if ((uv_primary_strength | uv_secondary_strength) == 0) {
470 if (thread_pool_ == nullptr) {
471 for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
472 CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
473 cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
474 MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
475 MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
476 sizeof(Pixel));
477 }
478 }
479 use_border_columns[border_columns_dst_index][1] = false;
480 return;
481 }
482
483 if (!is_frame_right && thread_pool_ != nullptr) {
484 use_border_columns[border_columns_dst_index][1] = true;
485 for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
486 // Backup the last 2 columns for use in the next iteration.
487 const uint8_t* src_line =
488 GetSourceBuffer(static_cast<Plane>(plane), row4x4_start,
489 column4x4_start + block_width4x4) -
490 kCdefBorder * sizeof(Pixel);
491 CopyPixels(src_line, frame_buffer_.stride(plane),
492 border_columns[border_columns_dst_index][plane],
493 kCdefBorder * sizeof(Pixel), kCdefBorder,
494 MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
495 sizeof(Pixel));
496 }
497 }
498
499 PrepareCdefBlock<Pixel>(
500 block_width4x4, block_height4x4, row4x4_start, column4x4_start,
501 cdef_block, kCdefUnitSizeWithBorders, false,
502 (border_columns != nullptr) ? border_columns[border_columns_src_index]
503 : nullptr,
504 use_border_columns[border_columns_src_index][1]);
505
506 // uv_strength_index is 0 for both primary and secondary strengths being
507 // non-zero, 1 for primary only, 2 for secondary only.
508 const int uv_strength_index =
509 (static_cast<int>(uv_primary_strength == 0) << 1) |
510 static_cast<int>(uv_secondary_strength == 0);
511 for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
512 const int8_t subsampling_x = subsampling_x_[plane];
513 const int8_t subsampling_y = subsampling_y_[plane];
514 const int block_width = kStep >> subsampling_x;
515 const int block_height = kStep >> subsampling_y;
516 int row4x4 = row4x4_start;
517
518 y_index = 0;
519 do {
520 uint8_t* cdef_buffer_base = cdef_buffer_row_base[plane];
521 const uint8_t* src_buffer_base = src_buffer_row_base[plane];
522 const uint16_t* cdef_src_base = cdef_src_row_base[plane];
523 int column4x4 = column4x4_start;
524 do {
525 const int cdef_stride = frame_buffer_.stride(plane);
526 uint8_t* const cdef_buffer = cdef_buffer_base;
527 const int src_stride = frame_buffer_.stride(plane);
528 const uint8_t* const src_buffer = src_buffer_base;
529 const uint16_t* const cdef_src = cdef_src_base;
530 const bool skip = (direction_y[y_index] & kCdefSkip) != 0;
531 int dual_cdef = 0;
532
533 if (skip) { // No cdef filtering.
534 if (thread_pool_ == nullptr) {
535 CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
536 block_width, block_height, sizeof(Pixel));
537 }
538 } else {
539 // Make sure block pair is not out of bounds.
540 if (column4x4 + (kStep4x4 * 2) <= column4x4_start + block_width4x4) {
541 // Enable dual processing if subsampling_x is 1.
542 dual_cdef = subsampling_x;
543 }
544
545 int direction = (uv_primary_strength == 0)
546 ? 0
547 : kCdefUvDirection[subsampling_x][subsampling_y]
548 [direction_y[y_index]];
549
550 if (dual_cdef != 0) {
551 if (uv_primary_strength &&
552 direction_y[y_index] != direction_y[y_index + 1]) {
553 // Disable dual processing if the second block of the pair does
554 // not have the same direction.
555 dual_cdef = 0;
556 }
557
558 // Disable dual processing if the second block of the pair is a
559 // skip.
560 if (direction_y[y_index + 1] == kCdefSkip) {
561 dual_cdef = 0;
562 }
563 }
564
565 // Block width is 8 if either dual_cdef is true or subsampling_x == 0.
566 const int width_index = dual_cdef | (subsampling_x ^ 1);
567 dsp_.cdef_filters[width_index][uv_strength_index](
568 cdef_src, kCdefUnitSizeWithBorders, block_height,
569 uv_primary_strength, uv_secondary_strength,
570 frame_header_.cdef.damping - 1, direction, cdef_buffer,
571 cdef_stride);
572 }
573 // When dual_cdef is set, the above cdef_filter() will process 2 blocks,
574 // so adjust the pointers and indexes for 2 blocks.
575 cdef_buffer_base += column_step[plane] << dual_cdef;
576 src_buffer_base += column_step[plane] << dual_cdef;
577 cdef_src_base += (column_step[plane] / sizeof(Pixel)) << dual_cdef;
578 column4x4 += kStep4x4 << dual_cdef;
579 y_index += 1 << dual_cdef;
580 } while (column4x4 < column4x4_start + block_width4x4);
581
582 cdef_buffer_row_base[plane] += cdef_buffer_row_base_stride[plane];
583 src_buffer_row_base[plane] += src_buffer_row_base_stride[plane];
584 cdef_src_row_base[plane] += cdef_src_row_base_stride[plane];
585 row4x4 += kStep4x4;
586 } while (row4x4 < row4x4_start + block_height4x4);
587 }
588 }
589
ApplyCdefForOneSuperBlockRowHelper(uint16_t * cdef_block,uint8_t border_columns[2][kMaxPlanes][256],int row4x4,int block_height4x4)590 void PostFilter::ApplyCdefForOneSuperBlockRowHelper(
591 uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
592 int row4x4, int block_height4x4) {
593 bool use_border_columns[2][2] = {};
594 for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
595 column4x4 += kStep64x64) {
596 const int index = cdef_index_[DivideBy16(row4x4)][DivideBy16(column4x4)];
597 const int block_width4x4 =
598 std::min(kStep64x64, frame_header_.columns4x4 - column4x4);
599
600 #if LIBGAV1_MAX_BITDEPTH >= 10
601 if (bitdepth_ >= 10) {
602 ApplyCdefForOneUnit<uint16_t>(cdef_block, index, block_width4x4,
603 block_height4x4, row4x4, column4x4,
604 border_columns, use_border_columns);
605 continue;
606 }
607 #endif // LIBGAV1_MAX_BITDEPTH >= 10
608 ApplyCdefForOneUnit<uint8_t>(cdef_block, index, block_width4x4,
609 block_height4x4, row4x4, column4x4,
610 border_columns, use_border_columns);
611 }
612 }
613
ApplyCdefForOneSuperBlockRow(int row4x4_start,int sb4x4,bool is_last_row)614 void PostFilter::ApplyCdefForOneSuperBlockRow(int row4x4_start, int sb4x4,
615 bool is_last_row) {
616 assert(row4x4_start >= 0);
617 assert(DoCdef());
618 for (int y = 0; y < sb4x4; y += kStep64x64) {
619 const int row4x4 = row4x4_start + y;
620 if (row4x4 >= frame_header_.rows4x4) return;
621
622 // Apply cdef for the last 8 rows of the previous superblock row.
623 // One exception: If the superblock size is 128x128 and is_last_row is true,
624 // then we simply apply cdef for the entire superblock row without any lag.
625 // In that case, apply cdef for the previous superblock row only during the
626 // first iteration (y == 0).
627 if (row4x4 > 0 && (!is_last_row || y == 0)) {
628 assert(row4x4 >= 16);
629 ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4 - 2, 2);
630 }
631
632 // Apply cdef for the current superblock row. If this is the last superblock
633 // row we apply cdef for all the rows, otherwise we leave out the last 8
634 // rows.
635 const int block_height4x4 =
636 std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
637 const int height4x4 = block_height4x4 - (is_last_row ? 0 : 2);
638 if (height4x4 > 0) {
639 ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4,
640 height4x4);
641 }
642 }
643 }
644
ApplyCdefWorker(std::atomic<int> * row4x4_atomic)645 void PostFilter::ApplyCdefWorker(std::atomic<int>* row4x4_atomic) {
646 int row4x4;
647 uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
648 // Each border_column buffer has to store 64 rows and 2 columns for each
649 // plane. For 10bit, that is 64*2*2 = 256 bytes.
650 alignas(kMaxAlignment) uint8_t border_columns[2][kMaxPlanes][256];
651 while ((row4x4 = row4x4_atomic->fetch_add(
652 kStep64x64, std::memory_order_relaxed)) < frame_header_.rows4x4) {
653 const int block_height4x4 =
654 std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
655 ApplyCdefForOneSuperBlockRowHelper(cdef_block, border_columns, row4x4,
656 block_height4x4);
657 }
658 }
659
660 } // namespace libgav1
661