1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 *
11 */
12
13 #include <math.h>
14
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17 #include "config/aom_scale_rtcd.h"
18
19 #include "aom_mem/aom_mem.h"
20 #include "av1/common/av1_common_int.h"
21 #include "av1/common/resize.h"
22 #include "av1/common/restoration.h"
23 #include "aom_dsp/aom_dsp_common.h"
24 #include "aom_mem/aom_mem.h"
25
26 #include "aom_ports/mem.h"
27
28 // The 's' values are calculated based on original 'r' and 'e' values in the
29 // spec using GenSgrprojVtable().
30 // Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
31 const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
32 { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
33 { { 2, 1 }, { 93, 1618 } }, { { 2, 1 }, { 80, 1438 } },
34 { { 2, 1 }, { 70, 1295 } }, { { 2, 1 }, { 58, 1177 } },
35 { { 2, 1 }, { 47, 1079 } }, { { 2, 1 }, { 37, 996 } },
36 { { 2, 1 }, { 30, 925 } }, { { 2, 1 }, { 25, 863 } },
37 { { 0, 1 }, { -1, 2589 } }, { { 0, 1 }, { -1, 1618 } },
38 { { 0, 1 }, { -1, 1177 } }, { { 0, 1 }, { -1, 925 } },
39 { { 2, 0 }, { 56, -1 } }, { { 2, 0 }, { 22, -1 } },
40 };
41
av1_whole_frame_rect(const AV1_COMMON * cm,int is_uv)42 AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
43 AV1PixelRect rect;
44
45 int ss_x = is_uv && cm->seq_params.subsampling_x;
46 int ss_y = is_uv && cm->seq_params.subsampling_y;
47
48 rect.top = 0;
49 rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
50 rect.left = 0;
51 rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
52 return rect;
53 }
54
// Returns how many restoration units span a tile along one axis.
//
// unit_size: size of a restoration unit along that axis.
// tile_size: extent of the tile along that axis (pass a width to count
//            horizontal units, a height to count vertical units).
//
// The quotient is rounded to nearest rather than up, which models the way a
// right or bottom restoration unit can extend to up to 150% of its normal
// width or height. The result is clamped to at least 1 so that a tile smaller
// than half of a restoration unit still gets one unit.
int av1_lr_count_units_in_tile(int unit_size, int tile_size) {
  const int nearest_count = (tile_size + (unit_size >> 1)) / unit_size;
  return (nearest_count > 1) ? nearest_count : 1;
}
65
// Sizes and (re)allocates the per-unit restoration info array of rsi.
//
// cm:    common state; used to obtain the frame rectangle and passed to
//        CHECK_MEM_ERROR for the allocation check.
// rsi:   restoration info to fill in. Its restoration_unit_size is read, and
//        units_per_tile, horz_units_per_tile, vert_units_per_tile and
//        unit_info are written.
// is_uv: non-zero for a chroma plane.
//
// Any array previously held in rsi->unit_info is freed first.
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                  int is_uv) {
  // The unit grid is sized from the whole-frame rectangle of this plane.
  const AV1PixelRect frame_rect = av1_whole_frame_rect(cm, is_uv);
  const int frame_w = frame_rect.right - frame_rect.left;
  const int frame_h = frame_rect.bottom - frame_rect.top;

  // Units per axis: round-to-nearest division with a minimum of one unit,
  // as implemented by av1_lr_count_units_in_tile().
  const int unit_size = rsi->restoration_unit_size;
  const int units_across = av1_lr_count_units_in_tile(unit_size, frame_w);
  const int units_down = av1_lr_count_units_in_tile(unit_size, frame_h);
  const int units_total = units_across * units_down;

  rsi->horz_units_per_tile = units_across;
  rsi->vert_units_per_tile = units_down;
  rsi->units_per_tile = units_total;

  // Replace the old array with a fresh 16-byte aligned one.
  aom_free(rsi->unit_info);
  CHECK_MEM_ERROR(cm, rsi->unit_info,
                  (RestorationUnitInfo *)aom_memalign(
                      16, sizeof(*rsi->unit_info) * units_total));
}
100
// Releases the per-unit info array held by rst_info and resets the pointer to
// NULL so that no stale pointer is left behind in the struct.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  RestorationUnitInfo *const old_units = rst_info->unit_info;
  rst_info->unit_info = NULL;
  aom_free(old_units);
}
105
106 #if 0
107 // Pair of values for each sgrproj parameter:
108 // Index 0 corresponds to r[0], e[0]
109 // Index 1 corresponds to r[1], e[1]
110 int sgrproj_mtable[SGRPROJ_PARAMS][2];
111
112 static void GenSgrprojVtable() {
113 for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
114 const sgr_params_type *const params = &av1_sgr_params[i];
115 for (int j = 0; j < 2; ++j) {
116 const int e = params->e[j];
117 const int r = params->r[j];
118 if (r == 0) { // filter is disabled
119 sgrproj_mtable[i][j] = -1; // mark invalid
120 } else { // filter is enabled
121 const int n = (2 * r + 1) * (2 * r + 1);
122 const int n2e = n * n * e;
123 assert(n2e != 0);
124 sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
125 }
126 }
127 }
128 }
129 #endif
130
// One-time precalculation hook for loop restoration. The only work it would
// do, regenerating the sgrproj table, is compiled out with #if 0, so the
// function currently has no effect.
//
// The parameter list is spelled (void) so that the definition is a proper
// prototype rather than an old-style empty parameter list.
void av1_loop_restoration_precal(void) {
#if 0
  GenSgrprojVtable();
#endif
}
136
// Pads an 8-bit plane in place by replicating its edge pixels.
//
// data:        pointer to the top-left visible pixel.
// width/height: size of the visible area in pixels.
// stride:      distance between consecutive rows, in pixels.
// border_horz: number of columns to fill on the left and on the right.
// border_vert: number of rows to fill above and below.
//
// The caller must provide writable memory for the whole padded area.
static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
                               int border_horz, int border_vert) {
  // Pass 1: widen every visible row by repeating its first and last pixel.
  for (int row = 0; row < height; ++row) {
    uint8_t *const line = data + row * stride;
    memset(line - border_horz, line[0], border_horz);
    memset(line + width, line[width - 1], border_horz);
  }

  // Pass 2: replicate the already widened first and last rows, so the
  // corners are filled as well.
  uint8_t *const padded = data - border_horz;
  const int padded_width = width + 2 * border_horz;
  for (int row = -border_vert; row < 0; ++row) {
    memcpy(padded + row * stride, padded, padded_width);
  }
  for (int extra = 0; extra < border_vert; ++extra) {
    memcpy(padded + (height + extra) * stride, padded + (height - 1) * stride,
           padded_width);
  }
}
155
156 #if CONFIG_AV1_HIGHBITDEPTH
// Pads a 16-bit plane in place by replicating its edge pixels.
//
// data:        pointer to the top-left visible sample.
// width/height: size of the visible area in samples.
// stride:      distance between consecutive rows, in samples.
// border_horz: number of columns to fill on the left and on the right.
// border_vert: number of rows to fill above and below.
//
// The caller must provide writable memory for the whole padded area.
static void extend_frame_highbd(uint16_t *data, int width, int height,
                                int stride, int border_horz, int border_vert) {
  // Pass 1: widen every visible row by repeating its first and last sample.
  for (int row = 0; row < height; ++row) {
    uint16_t *const line = data + row * stride;
    for (int col = -border_horz; col < 0; ++col) {
      line[col] = line[0];
    }
    for (int extra = 0; extra < border_horz; ++extra) {
      line[width + extra] = line[width - 1];
    }
  }

  // Pass 2: replicate the already widened first and last rows, so the
  // corners are filled as well.
  uint16_t *const padded = data - border_horz;
  const size_t padded_bytes = (width + 2 * border_horz) * sizeof(uint16_t);
  for (int row = -border_vert; row < 0; ++row) {
    memcpy(padded + row * stride, padded, padded_bytes);
  }
  for (int extra = 0; extra < border_vert; ++extra) {
    memcpy(padded + (height + extra) * stride, padded + (height - 1) * stride,
           padded_bytes);
  }
}
176
// Copies a width x height block of 16-bit samples from src to dst, one row at
// a time. Strides are given in samples; the regions must not overlap.
static void copy_tile_highbd(int width, int height, const uint16_t *src,
                             int src_stride, uint16_t *dst, int dst_stride) {
  const size_t row_bytes = width * sizeof(*dst);
  for (int row = 0; row < height; ++row) {
    const uint16_t *const from = src + row * src_stride;
    uint16_t *const to = dst + row * dst_stride;
    memcpy(to, from, row_bytes);
  }
}
182 #endif
183
// Pads a plane in place by replicating its edge pixels, dispatching on the
// sample size.
//
// data:   plane pointer; when highbd is non-zero (and high bit depth support
//         is compiled in) it is converted with CONVERT_TO_SHORTPTR and the
//         plane is treated as 16-bit samples.
// width, height, stride, border_horz, border_vert: forwarded unchanged to the
//         8-bit or 16-bit helper.
// highbd: non-zero to select the 16-bit path; ignored when
//         CONFIG_AV1_HIGHBITDEPTH is off.
void av1_extend_frame(uint8_t *data, int width, int height, int stride,
                      int border_horz, int border_vert, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    uint16_t *const data16 = CONVERT_TO_SHORTPTR(data);
    extend_frame_highbd(data16, width, height, stride, border_horz,
                        border_vert);
    return;
  }
#else
  (void)highbd;
#endif
  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
}
196
// Copies a width x height block of 8-bit pixels from src to dst, one row at a
// time. Strides are given in pixels; the regions must not overlap.
static void copy_tile_lowbd(int width, int height, const uint8_t *src,
                            int src_stride, uint8_t *dst, int dst_stride) {
  for (int row = 0; row < height; ++row) {
    const uint8_t *const from = src + row * src_stride;
    uint8_t *const to = dst + row * dst_stride;
    memcpy(to, from, width);
  }
}
202
// Copies a width x height block from src to dst, dispatching on the sample
// size. When highbd is non-zero (and high bit depth support is compiled in)
// both pointers are converted with CONVERT_TO_SHORTPTR and 16-bit samples are
// copied; otherwise 8-bit pixels are copied. Strides are in samples.
static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
                      uint8_t *dst, int dst_stride, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src);
    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
    copy_tile_highbd(width, height, src16, src_stride, dst16, dst_stride);
    return;
  }
#else
  (void)highbd;
#endif
  copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
}
215
// Yields the byte pointer to hand to memcpy() for buffer handle d: when hbd
// is non-zero the handle is first converted with CONVERT_TO_SHORTPTR and then
// viewed as bytes, otherwise d is used unchanged.
#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
217
218 // With striped loop restoration, the filtering for each 64-pixel stripe gets
219 // most of its input from the output of CDEF (stored in data8), but we need to
// fill out a border of 3 pixels above/below the stripe according to the
// following rules:
223 //
224 // * At a frame boundary, we copy the outermost row of CDEF pixels three times.
225 // This extension is done by a call to av1_extend_frame() at the start of the
226 // loop restoration process, so the value of copy_above/copy_below doesn't
227 // strictly matter. However, by setting *copy_above = *copy_below = 1 whenever
228 // loop filtering across tiles is disabled, we can allow
229 // {setup,restore}_processing_stripe_boundary to assume that the top/bottom
230 // data has always been copied, simplifying the behaviour at the left and
231 // right edges of tiles.
232 //
233 // * If we're at a tile boundary and loop filtering across tiles is enabled,
234 // then there is a logical stripe which is 64 pixels high, but which is split
235 // into an 8px high and a 56px high stripe so that the processing (and
236 // coefficient set usage) can be aligned to tiles.
237 // In this case, we use the 3 rows of CDEF output across the boundary for
238 // context; this corresponds to leaving the frame buffer as-is.
239 //
240 // * If we're at a tile boundary and loop filtering across tiles is disabled,
241 // then we take the outermost row of CDEF pixels *within the current tile*
242 // and copy it three times. Thus we behave exactly as if the tile were a full
243 // frame.
244 //
245 // * Otherwise, we're at a stripe boundary within a tile. In that case, we
246 // take 2 rows of deblocked pixels and extend them to 3 rows of context.
247 //
248 // The distinction between the latter two cases is handled by the
249 // av1_loop_restoration_save_boundary_lines() function, so here we just need
250 // to decide if we're overwriting the above/below boundary pixels or not.
get_stripe_boundary_info(const RestorationTileLimits * limits,const AV1PixelRect * tile_rect,int ss_y,int * copy_above,int * copy_below)251 static void get_stripe_boundary_info(const RestorationTileLimits *limits,
252 const AV1PixelRect *tile_rect, int ss_y,
253 int *copy_above, int *copy_below) {
254 *copy_above = 1;
255 *copy_below = 1;
256
257 const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
258 const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
259
260 const int first_stripe_in_tile = (limits->v_start == tile_rect->top);
261 const int this_stripe_height =
262 full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
263 const int last_stripe_in_tile =
264 (limits->v_start + this_stripe_height >= tile_rect->bottom);
265
266 if (first_stripe_in_tile) *copy_above = 0;
267 if (last_stripe_in_tile) *copy_below = 0;
268 }
269
270 // Overwrite the border pixels around a processing stripe so that the conditions
271 // listed above get_stripe_boundary_info() are preserved.
272 // We save the pixels which get overwritten into a temporary buffer, so that
273 // they can be restored by restore_processing_stripe_boundary() after we've
274 // processed the stripe.
275 //
276 // limits gives the rectangular limits of the remaining stripes for the current
277 // restoration unit. rsb is the stored stripe boundaries (taken from either
278 // deblock or CDEF output as necessary).
279 //
280 // tile_rect is the limits of the current tile and tile_stripe0 is the index of
281 // the first stripe in this tile (needed to convert the tile-relative stripe
282 // index we get from limits into something we can look up in rsb).
setup_processing_stripe_boundary(const RestorationTileLimits * limits,const RestorationStripeBoundaries * rsb,int rsb_row,int use_highbd,int h,uint8_t * data8,int data_stride,RestorationLineBuffers * rlbs,int copy_above,int copy_below,int opt)283 static void setup_processing_stripe_boundary(
284 const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
285 int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
286 RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
287 // Offsets within the line buffers. The buffer logically starts at column
288 // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
289 // has column x0 in the buffer.
290 const int buf_stride = rsb->stripe_boundary_stride;
291 const int buf_x0_off = limits->h_start;
292 const int line_width =
293 (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
294 const int line_size = line_width << use_highbd;
295
296 const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
297
298 // Replace RESTORATION_BORDER pixels above the top of the stripe
299 // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
300 // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
301 // duplicating the topmost of the 2 lines (see the AOMMAX call when
302 // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
303 //
304 // Special case: If we're at the top of a tile, which isn't on the topmost
305 // tile row, and we're allowed to loop filter across tiles, then we have a
306 // logical 64-pixel-high stripe which has been split into an 8-pixel high
307 // stripe and a 56-pixel high stripe (the current one). So, in this case,
308 // we want to leave the boundary alone!
309 if (!opt) {
310 if (copy_above) {
311 uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
312
313 for (int i = -RESTORATION_BORDER; i < 0; ++i) {
314 const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
315 const int buf_off = buf_x0_off + buf_row * buf_stride;
316 const uint8_t *buf =
317 rsb->stripe_boundary_above + (buf_off << use_highbd);
318 uint8_t *dst8 = data8_tl + i * data_stride;
319 // Save old pixels, then replace with data from stripe_boundary_above
320 memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
321 REAL_PTR(use_highbd, dst8), line_size);
322 memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
323 }
324 }
325
326 // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
327 // The second buffer row is repeated, so src_row gets the values 0, 1, 1
328 // for i = 0, 1, 2.
329 if (copy_below) {
330 const int stripe_end = limits->v_start + h;
331 uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
332
333 for (int i = 0; i < RESTORATION_BORDER; ++i) {
334 const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
335 const int buf_off = buf_x0_off + buf_row * buf_stride;
336 const uint8_t *src =
337 rsb->stripe_boundary_below + (buf_off << use_highbd);
338
339 uint8_t *dst8 = data8_bl + i * data_stride;
340 // Save old pixels, then replace with data from stripe_boundary_below
341 memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
342 memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
343 }
344 }
345 } else {
346 if (copy_above) {
347 uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
348
349 // Only save and overwrite i=-RESTORATION_BORDER line.
350 uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
351 // Save old pixels, then replace with data from stripe_boundary_above
352 memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
353 memcpy(REAL_PTR(use_highbd, dst8),
354 REAL_PTR(use_highbd,
355 data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
356 line_size);
357 }
358
359 if (copy_below) {
360 const int stripe_end = limits->v_start + h;
361 uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
362
363 // Only save and overwrite i=2 line.
364 uint8_t *dst8 = data8_bl + 2 * data_stride;
365 // Save old pixels, then replace with data from stripe_boundary_below
366 memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
367 memcpy(REAL_PTR(use_highbd, dst8),
368 REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
369 }
370 }
371 }
372
373 // This function restores the boundary lines modified by
374 // setup_processing_stripe_boundary.
375 //
376 // Note: We need to be careful when handling the corners of the processing
377 // unit, because (eg.) the top-left corner is considered to be part of
378 // both the left and top borders. This means that, depending on the
379 // loop_filter_across_tiles_enabled flag, the corner pixels might get
380 // overwritten twice, once as part of the "top" border and once as part
381 // of the "left" border (or similar for other corners).
382 //
383 // Everything works out fine as long as we make sure to reverse the order
384 // when restoring, ie. we need to restore the left/right borders followed
385 // by the top/bottom borders.
restore_processing_stripe_boundary(const RestorationTileLimits * limits,const RestorationLineBuffers * rlbs,int use_highbd,int h,uint8_t * data8,int data_stride,int copy_above,int copy_below,int opt)386 static void restore_processing_stripe_boundary(
387 const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
388 int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
389 int copy_below, int opt) {
390 const int line_width =
391 (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
392 const int line_size = line_width << use_highbd;
393
394 const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
395
396 if (!opt) {
397 if (copy_above) {
398 uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
399 for (int i = -RESTORATION_BORDER; i < 0; ++i) {
400 uint8_t *dst8 = data8_tl + i * data_stride;
401 memcpy(REAL_PTR(use_highbd, dst8),
402 rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
403 }
404 }
405
406 if (copy_below) {
407 const int stripe_bottom = limits->v_start + h;
408 uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
409
410 for (int i = 0; i < RESTORATION_BORDER; ++i) {
411 if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
412
413 uint8_t *dst8 = data8_bl + i * data_stride;
414 memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
415 }
416 }
417 } else {
418 if (copy_above) {
419 uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
420
421 // Only restore i=-RESTORATION_BORDER line.
422 uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
423 memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
424 }
425
426 if (copy_below) {
427 const int stripe_bottom = limits->v_start + h;
428 uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
429
430 // Only restore i=2 line.
431 if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
432 uint8_t *dst8 = data8_bl + 2 * data_stride;
433 memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
434 }
435 }
436 }
437 }
438
wiener_filter_stripe(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int32_t * tmpbuf,int bit_depth)439 static void wiener_filter_stripe(const RestorationUnitInfo *rui,
440 int stripe_width, int stripe_height,
441 int procunit_width, const uint8_t *src,
442 int src_stride, uint8_t *dst, int dst_stride,
443 int32_t *tmpbuf, int bit_depth) {
444 (void)tmpbuf;
445 (void)bit_depth;
446 assert(bit_depth == 8);
447 const ConvolveParams conv_params = get_conv_params_wiener(8);
448
449 for (int j = 0; j < stripe_width; j += procunit_width) {
450 int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
451 const uint8_t *src_p = src + j;
452 uint8_t *dst_p = dst + j;
453 av1_wiener_convolve_add_src(
454 src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
455 rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
456 }
457 }
458
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
   over the input. The window is of size (2r + 1)x(2r + 1), and we
   specialize to r = 1 (boxsum1) and r = 2 (boxsum2). boxsum() asserts
   on any other value of r.

   Each loop follows the same format: We keep a window's worth of input
   in individual variables and select data out of that as appropriate.
*/
boxsum1(int32_t * src,int width,int height,int src_stride,int sqr,int32_t * dst,int dst_stride)466 static void boxsum1(int32_t *src, int width, int height, int src_stride,
467 int sqr, int32_t *dst, int dst_stride) {
468 int i, j, a, b, c;
469 assert(width > 2 * SGRPROJ_BORDER_HORZ);
470 assert(height > 2 * SGRPROJ_BORDER_VERT);
471
472 // Vertical sum over 3-pixel regions, from src into dst.
473 if (!sqr) {
474 for (j = 0; j < width; ++j) {
475 a = src[j];
476 b = src[src_stride + j];
477 c = src[2 * src_stride + j];
478
479 dst[j] = a + b;
480 for (i = 1; i < height - 2; ++i) {
481 // Loop invariant: At the start of each iteration,
482 // a = src[(i - 1) * src_stride + j]
483 // b = src[(i ) * src_stride + j]
484 // c = src[(i + 1) * src_stride + j]
485 dst[i * dst_stride + j] = a + b + c;
486 a = b;
487 b = c;
488 c = src[(i + 2) * src_stride + j];
489 }
490 dst[i * dst_stride + j] = a + b + c;
491 dst[(i + 1) * dst_stride + j] = b + c;
492 }
493 } else {
494 for (j = 0; j < width; ++j) {
495 a = src[j] * src[j];
496 b = src[src_stride + j] * src[src_stride + j];
497 c = src[2 * src_stride + j] * src[2 * src_stride + j];
498
499 dst[j] = a + b;
500 for (i = 1; i < height - 2; ++i) {
501 dst[i * dst_stride + j] = a + b + c;
502 a = b;
503 b = c;
504 c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
505 }
506 dst[i * dst_stride + j] = a + b + c;
507 dst[(i + 1) * dst_stride + j] = b + c;
508 }
509 }
510
511 // Horizontal sum over 3-pixel regions of dst
512 for (i = 0; i < height; ++i) {
513 a = dst[i * dst_stride];
514 b = dst[i * dst_stride + 1];
515 c = dst[i * dst_stride + 2];
516
517 dst[i * dst_stride] = a + b;
518 for (j = 1; j < width - 2; ++j) {
519 // Loop invariant: At the start of each iteration,
520 // a = src[i * src_stride + (j - 1)]
521 // b = src[i * src_stride + (j )]
522 // c = src[i * src_stride + (j + 1)]
523 dst[i * dst_stride + j] = a + b + c;
524 a = b;
525 b = c;
526 c = dst[i * dst_stride + (j + 2)];
527 }
528 dst[i * dst_stride + j] = a + b + c;
529 dst[i * dst_stride + (j + 1)] = b + c;
530 }
531 }
532
boxsum2(int32_t * src,int width,int height,int src_stride,int sqr,int32_t * dst,int dst_stride)533 static void boxsum2(int32_t *src, int width, int height, int src_stride,
534 int sqr, int32_t *dst, int dst_stride) {
535 int i, j, a, b, c, d, e;
536 assert(width > 2 * SGRPROJ_BORDER_HORZ);
537 assert(height > 2 * SGRPROJ_BORDER_VERT);
538
539 // Vertical sum over 5-pixel regions, from src into dst.
540 if (!sqr) {
541 for (j = 0; j < width; ++j) {
542 a = src[j];
543 b = src[src_stride + j];
544 c = src[2 * src_stride + j];
545 d = src[3 * src_stride + j];
546 e = src[4 * src_stride + j];
547
548 dst[j] = a + b + c;
549 dst[dst_stride + j] = a + b + c + d;
550 for (i = 2; i < height - 3; ++i) {
551 // Loop invariant: At the start of each iteration,
552 // a = src[(i - 2) * src_stride + j]
553 // b = src[(i - 1) * src_stride + j]
554 // c = src[(i ) * src_stride + j]
555 // d = src[(i + 1) * src_stride + j]
556 // e = src[(i + 2) * src_stride + j]
557 dst[i * dst_stride + j] = a + b + c + d + e;
558 a = b;
559 b = c;
560 c = d;
561 d = e;
562 e = src[(i + 3) * src_stride + j];
563 }
564 dst[i * dst_stride + j] = a + b + c + d + e;
565 dst[(i + 1) * dst_stride + j] = b + c + d + e;
566 dst[(i + 2) * dst_stride + j] = c + d + e;
567 }
568 } else {
569 for (j = 0; j < width; ++j) {
570 a = src[j] * src[j];
571 b = src[src_stride + j] * src[src_stride + j];
572 c = src[2 * src_stride + j] * src[2 * src_stride + j];
573 d = src[3 * src_stride + j] * src[3 * src_stride + j];
574 e = src[4 * src_stride + j] * src[4 * src_stride + j];
575
576 dst[j] = a + b + c;
577 dst[dst_stride + j] = a + b + c + d;
578 for (i = 2; i < height - 3; ++i) {
579 dst[i * dst_stride + j] = a + b + c + d + e;
580 a = b;
581 b = c;
582 c = d;
583 d = e;
584 e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
585 }
586 dst[i * dst_stride + j] = a + b + c + d + e;
587 dst[(i + 1) * dst_stride + j] = b + c + d + e;
588 dst[(i + 2) * dst_stride + j] = c + d + e;
589 }
590 }
591
592 // Horizontal sum over 5-pixel regions of dst
593 for (i = 0; i < height; ++i) {
594 a = dst[i * dst_stride];
595 b = dst[i * dst_stride + 1];
596 c = dst[i * dst_stride + 2];
597 d = dst[i * dst_stride + 3];
598 e = dst[i * dst_stride + 4];
599
600 dst[i * dst_stride] = a + b + c;
601 dst[i * dst_stride + 1] = a + b + c + d;
602 for (j = 2; j < width - 3; ++j) {
603 // Loop invariant: At the start of each iteration,
604 // a = src[i * src_stride + (j - 2)]
605 // b = src[i * src_stride + (j - 1)]
606 // c = src[i * src_stride + (j )]
607 // d = src[i * src_stride + (j + 1)]
608 // e = src[i * src_stride + (j + 2)]
609 dst[i * dst_stride + j] = a + b + c + d + e;
610 a = b;
611 b = c;
612 c = d;
613 d = e;
614 e = dst[i * dst_stride + (j + 3)];
615 }
616 dst[i * dst_stride + j] = a + b + c + d + e;
617 dst[i * dst_stride + (j + 1)] = b + c + d + e;
618 dst[i * dst_stride + (j + 2)] = c + d + e;
619 }
620 }
621
// Dispatches to the box-sum specialisation for radius r (1 or 2). Any other
// radius trips an assert; with asserts compiled out nothing is written.
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default: assert(0 && "Invalid value of r in self-guided filter"); break;
  }
}
631
av1_decode_xq(const int * xqd,int * xq,const sgr_params_type * params)632 void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
633 if (params->r[0] == 0) {
634 xq[0] = 0;
635 xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
636 } else if (params->r[1] == 0) {
637 xq[0] = xqd[0];
638 xq[1] = 0;
639 } else {
640 xq[0] = xqd[0];
641 xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
642 }
643 }
644
// Lookup table: entries 1..254 hold round(256 * x / (x + 1)) for index x.
// The two end entries are special:
//  - Entry 0 maps to 1 (corresponding to a value of 1/256) instead of 0. See
//    the comments in calculate_intermediate_result() for why.
//  - Entry 255 is 256; callers clamp their index to 255, so it stands in for
//    every larger x.
// Eight entries per row; the trailing comment gives the index of the first
// entry in that row.
const int32_t av1_x_by_xplus1[256] = {
  1,   128, 171, 192, 205, 213, 219, 224,  // 0
  228, 230, 233, 235, 236, 238, 239, 240,  // 8
  241, 242, 243, 243, 244, 244, 245, 245,  // 16
  246, 246, 247, 247, 247, 247, 248, 248,  // 24
  248, 248, 249, 249, 249, 249, 249, 250,  // 32
  250, 250, 250, 250, 250, 250, 251, 251,  // 40
  251, 251, 251, 251, 251, 251, 251, 251,  // 48
  252, 252, 252, 252, 252, 252, 252, 252,  // 56
  252, 252, 252, 252, 252, 252, 252, 252,  // 64
  252, 253, 253, 253, 253, 253, 253, 253,  // 72
  253, 253, 253, 253, 253, 253, 253, 253,  // 80
  253, 253, 253, 253, 253, 253, 253, 253,  // 88
  253, 253, 253, 253, 253, 253, 254, 254,  // 96
  254, 254, 254, 254, 254, 254, 254, 254,  // 104
  254, 254, 254, 254, 254, 254, 254, 254,  // 112
  254, 254, 254, 254, 254, 254, 254, 254,  // 120
  254, 254, 254, 254, 254, 254, 254, 254,  // 128
  254, 254, 254, 254, 254, 254, 254, 254,  // 136
  254, 254, 254, 254, 254, 254, 254, 254,  // 144
  254, 254, 254, 254, 254, 254, 254, 254,  // 152
  254, 254, 254, 254, 254, 254, 254, 254,  // 160
  254, 254, 255, 255, 255, 255, 255, 255,  // 168
  255, 255, 255, 255, 255, 255, 255, 255,  // 176
  255, 255, 255, 255, 255, 255, 255, 255,  // 184
  255, 255, 255, 255, 255, 255, 255, 255,  // 192
  255, 255, 255, 255, 255, 255, 255, 255,  // 200
  255, 255, 255, 255, 255, 255, 255, 255,  // 208
  255, 255, 255, 255, 255, 255, 255, 255,  // 216
  255, 255, 255, 255, 255, 255, 255, 255,  // 224
  255, 255, 255, 255, 255, 255, 255, 255,  // 232
  255, 255, 255, 255, 255, 255, 255, 255,  // 240
  255, 255, 255, 255, 255, 255, 255, 256,  // 248
};
667
668 const int32_t av1_one_by_x[MAX_NELEM] = {
669 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
670 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164,
671 };
672
calculate_intermediate_result(int32_t * dgd,int width,int height,int dgd_stride,int bit_depth,int sgr_params_idx,int radius_idx,int pass,int32_t * A,int32_t * B)673 static void calculate_intermediate_result(int32_t *dgd, int width, int height,
674 int dgd_stride, int bit_depth,
675 int sgr_params_idx, int radius_idx,
676 int pass, int32_t *A, int32_t *B) {
677 const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
678 const int r = params->r[radius_idx];
679 const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
680 const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
681 // Adjusting the stride of A and B here appears to avoid bad cache effects,
682 // leading to a significant speed improvement.
683 // We also align the stride to a multiple of 16 bytes, for consistency
684 // with the SIMD version of this function.
685 int buf_stride = ((width_ext + 3) & ~3) + 16;
686 const int step = pass == 0 ? 1 : 2;
687 int i, j;
688
689 assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
690 assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
691 "Need SGRPROJ_BORDER_* >= r+1");
692
693 boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
694 width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
695 boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
696 width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
697 A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
698 B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
699 // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
700 // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
701 for (i = -1; i < height + 1; i += step) {
702 for (j = -1; j < width + 1; ++j) {
703 const int k = i * buf_stride + j;
704 const int n = (2 * r + 1) * (2 * r + 1);
705
706 // a < 2^16 * n < 2^22 regardless of bit depth
707 uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
708 // b < 2^8 * n < 2^14 regardless of bit depth
709 uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
710
711 // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
712 // and p itself satisfies p < 2^14 * n^2 < 2^26.
713 // This bound on p is due to:
714 // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
715 //
716 // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
717 // This is an artefact of rounding, and can only happen if all pixels
718 // are (almost) identical, so in this case we saturate to p=0.
719 uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
720
721 const uint32_t s = params->s[radius_idx];
722
723 // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
724 // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
725 // (this holds even after accounting for the rounding in s)
726 const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
727
728 // Note: We have to be quite careful about the value of A[k].
729 // This is used as a blend factor between individual pixel values and the
730 // local mean. So it logically has a range of [0, 256], including both
731 // endpoints.
732 //
733 // This is a pain for hardware, as we'd like something which can be stored
734 // in exactly 8 bits.
735 // Further, in the calculation of B[k] below, if z == 0 and r == 2,
736 // then A[k] "should be" 0. But then we can end up setting B[k] to a value
737 // slightly above 2^(8 + bit depth), due to rounding in the value of
738 // av1_one_by_x[25-1].
739 //
740 // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
741 // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
742 // overflow), without significantly affecting the final result: z == 0
743 // implies that the image is essentially "flat", so the local mean and
744 // individual pixel values are very similar.
745 //
      // Note that saturating on the other side, ie. requiring A[k] <= 255,
747 // would be a bad idea, as that corresponds to the case where the image
748 // is very variable, when we want to preserve the local pixel value as
749 // much as possible.
750 A[k] = av1_x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256]
751
752 // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
753 // av1_one_by_x[n - 1] = round(2^12 / n)
754 // => the product here is < 2^(20 + bit_depth) <= 2^32,
755 // and B[k] is set to a value < 2^(8 + bit depth)
756 // This holds even with the rounding in av1_one_by_x and in the overall
757 // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
758 B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
759 (uint32_t)B[k] *
760 (uint32_t)av1_one_by_x[n - 1],
761 SGRPROJ_RECIP_BITS);
762 }
763 }
764 }
765
selfguided_restoration_fast_internal(int32_t * dgd,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int sgr_params_idx,int radius_idx)766 static void selfguided_restoration_fast_internal(
767 int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
768 int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
769 const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
770 const int r = params->r[radius_idx];
771 const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
772 // Adjusting the stride of A and B here appears to avoid bad cache effects,
773 // leading to a significant speed improvement.
774 // We also align the stride to a multiple of 16 bytes, for consistency
775 // with the SIMD version of this function.
776 int buf_stride = ((width_ext + 3) & ~3) + 16;
777 int32_t A_[RESTORATION_PROC_UNIT_PELS];
778 int32_t B_[RESTORATION_PROC_UNIT_PELS];
779 int32_t *A = A_;
780 int32_t *B = B_;
781 int i, j;
782 calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
783 sgr_params_idx, radius_idx, 1, A, B);
784 A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
785 B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
786
787 // Use the A[] and B[] arrays to calculate the filtered image
788 (void)r;
789 assert(r == 2);
790 for (i = 0; i < height; ++i) {
791 if (!(i & 1)) { // even row
792 for (j = 0; j < width; ++j) {
793 const int k = i * buf_stride + j;
794 const int l = i * dgd_stride + j;
795 const int m = i * dst_stride + j;
796 const int nb = 5;
797 const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
798 (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
799 A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
800 5;
801 const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
802 (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
803 B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
804 5;
805 const int32_t v = a * dgd[l] + b;
806 dst[m] =
807 ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
808 }
809 } else { // odd row
810 for (j = 0; j < width; ++j) {
811 const int k = i * buf_stride + j;
812 const int l = i * dgd_stride + j;
813 const int m = i * dst_stride + j;
814 const int nb = 4;
815 const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
816 const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
817 const int32_t v = a * dgd[l] + b;
818 dst[m] =
819 ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
820 }
821 }
822 }
823 }
824
selfguided_restoration_internal(int32_t * dgd,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int sgr_params_idx,int radius_idx)825 static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
826 int dgd_stride, int32_t *dst,
827 int dst_stride, int bit_depth,
828 int sgr_params_idx,
829 int radius_idx) {
830 const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
831 // Adjusting the stride of A and B here appears to avoid bad cache effects,
832 // leading to a significant speed improvement.
833 // We also align the stride to a multiple of 16 bytes, for consistency
834 // with the SIMD version of this function.
835 int buf_stride = ((width_ext + 3) & ~3) + 16;
836 int32_t A_[RESTORATION_PROC_UNIT_PELS];
837 int32_t B_[RESTORATION_PROC_UNIT_PELS];
838 int32_t *A = A_;
839 int32_t *B = B_;
840 int i, j;
841 calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
842 sgr_params_idx, radius_idx, 0, A, B);
843 A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
844 B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
845
846 // Use the A[] and B[] arrays to calculate the filtered image
847 for (i = 0; i < height; ++i) {
848 for (j = 0; j < width; ++j) {
849 const int k = i * buf_stride + j;
850 const int l = i * dgd_stride + j;
851 const int m = i * dst_stride + j;
852 const int nb = 5;
853 const int32_t a =
854 (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
855 4 +
856 (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
857 A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
858 3;
859 const int32_t b =
860 (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
861 4 +
862 (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
863 B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
864 3;
865 const int32_t v = a * dgd[l] + b;
866 dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
867 }
868 }
869 }
870
av1_selfguided_restoration_c(const uint8_t * dgd8,int width,int height,int dgd_stride,int32_t * flt0,int32_t * flt1,int flt_stride,int sgr_params_idx,int bit_depth,int highbd)871 int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
872 int dgd_stride, int32_t *flt0, int32_t *flt1,
873 int flt_stride, int sgr_params_idx,
874 int bit_depth, int highbd) {
875 int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
876 const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
877 int32_t *dgd32 =
878 dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
879
880 if (highbd) {
881 const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
882 for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
883 for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
884 dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
885 }
886 }
887 } else {
888 for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
889 for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
890 dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
891 }
892 }
893 }
894
895 const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
896 // If params->r == 0 we skip the corresponding filter. We only allow one of
897 // the radii to be 0, as having both equal to 0 would be equivalent to
898 // skipping SGR entirely.
899 assert(!(params->r[0] == 0 && params->r[1] == 0));
900
901 if (params->r[0] > 0)
902 selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
903 flt0, flt_stride, bit_depth,
904 sgr_params_idx, 0);
905 if (params->r[1] > 0)
906 selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
907 flt_stride, bit_depth, sgr_params_idx, 1);
908 return 0;
909 }
910
av1_apply_selfguided_restoration_c(const uint8_t * dat8,int width,int height,int stride,int eps,const int * xqd,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth,int highbd)911 void av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
912 int height, int stride, int eps,
913 const int *xqd, uint8_t *dst8,
914 int dst_stride, int32_t *tmpbuf,
915 int bit_depth, int highbd) {
916 int32_t *flt0 = tmpbuf;
917 int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
918 assert(width * height <= RESTORATION_UNITPELS_MAX);
919
920 const int ret = av1_selfguided_restoration_c(
921 dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
922 (void)ret;
923 assert(!ret);
924 const sgr_params_type *const params = &av1_sgr_params[eps];
925 int xq[2];
926 av1_decode_xq(xqd, xq, params);
927 for (int i = 0; i < height; ++i) {
928 for (int j = 0; j < width; ++j) {
929 const int k = i * width + j;
930 uint8_t *dst8ij = dst8 + i * dst_stride + j;
931 const uint8_t *dat8ij = dat8 + i * stride + j;
932
933 const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
934 const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
935 int32_t v = u << SGRPROJ_PRJ_BITS;
936 // If params->r == 0 then we skipped the filtering in
937 // av1_selfguided_restoration_c, i.e. flt[k] == u
938 if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
939 if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
940 const int16_t w =
941 (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
942
943 const uint16_t out = clip_pixel_highbd(w, bit_depth);
944 if (highbd)
945 *CONVERT_TO_SHORTPTR(dst8ij) = out;
946 else
947 *dst8ij = (uint8_t)out;
948 }
949 }
950 }
951
sgrproj_filter_stripe(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int32_t * tmpbuf,int bit_depth)952 static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
953 int stripe_width, int stripe_height,
954 int procunit_width, const uint8_t *src,
955 int src_stride, uint8_t *dst, int dst_stride,
956 int32_t *tmpbuf, int bit_depth) {
957 (void)bit_depth;
958 assert(bit_depth == 8);
959
960 for (int j = 0; j < stripe_width; j += procunit_width) {
961 int w = AOMMIN(procunit_width, stripe_width - j);
962 av1_apply_selfguided_restoration(
963 src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
964 rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, 0);
965 }
966 }
967
968 #if CONFIG_AV1_HIGHBITDEPTH
wiener_filter_stripe_highbd(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth)969 static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
970 int stripe_width, int stripe_height,
971 int procunit_width, const uint8_t *src8,
972 int src_stride, uint8_t *dst8,
973 int dst_stride, int32_t *tmpbuf,
974 int bit_depth) {
975 (void)tmpbuf;
976 const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
977
978 for (int j = 0; j < stripe_width; j += procunit_width) {
979 int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
980 const uint8_t *src8_p = src8 + j;
981 uint8_t *dst8_p = dst8 + j;
982 av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
983 rui->wiener_info.hfilter, 16,
984 rui->wiener_info.vfilter, 16, w,
985 stripe_height, &conv_params, bit_depth);
986 }
987 }
988
sgrproj_filter_stripe_highbd(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth)989 static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
990 int stripe_width, int stripe_height,
991 int procunit_width,
992 const uint8_t *src8, int src_stride,
993 uint8_t *dst8, int dst_stride,
994 int32_t *tmpbuf, int bit_depth) {
995 for (int j = 0; j < stripe_width; j += procunit_width) {
996 int w = AOMMIN(procunit_width, stripe_width - j);
997 av1_apply_selfguided_restoration(
998 src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
999 rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
1000 }
1001 }
1002 #endif // CONFIG_AV1_HIGHBITDEPTH
1003
// Common signature of the per-stripe filter functions stored in
// stripe_filters[]. A stripe filter processes a stripe_width x stripe_height
// stripe from src into dst, working in chunks of procunit_width columns, using
// the filter parameters in rui. tmpbuf is scratch space (ignored by the Wiener
// variants) and bit_depth is the sample bit depth.
typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
                                  int stripe_width, int stripe_height,
                                  int procunit_width, const uint8_t *src,
                                  int src_stride, uint8_t *dst, int dst_stride,
                                  int32_t *tmpbuf, int bit_depth);
1009
1010 #if CONFIG_AV1_HIGHBITDEPTH
1011 #define NUM_STRIPE_FILTERS 4
1012 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
1013 wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
1014 sgrproj_filter_stripe_highbd
1015 };
1016 #else
1017 #define NUM_STRIPE_FILTERS 2
1018 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
1019 wiener_filter_stripe, sgrproj_filter_stripe
1020 };
1021 #endif // CONFIG_AV1_HIGHBITDEPTH
1022
1023 // Filter one restoration unit
av1_loop_restoration_filter_unit(const RestorationTileLimits * limits,const RestorationUnitInfo * rui,const RestorationStripeBoundaries * rsb,RestorationLineBuffers * rlbs,const AV1PixelRect * tile_rect,int tile_stripe0,int ss_x,int ss_y,int highbd,int bit_depth,uint8_t * data8,int stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int optimized_lr)1024 void av1_loop_restoration_filter_unit(
1025 const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
1026 const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
1027 const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
1028 int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
1029 int dst_stride, int32_t *tmpbuf, int optimized_lr) {
1030 RestorationType unit_rtype = rui->restoration_type;
1031
1032 int unit_h = limits->v_end - limits->v_start;
1033 int unit_w = limits->h_end - limits->h_start;
1034 uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;
1035 uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
1036
1037 if (unit_rtype == RESTORE_NONE) {
1038 copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
1039 return;
1040 }
1041
1042 const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1043 assert(filter_idx < NUM_STRIPE_FILTERS);
1044 const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1045
1046 const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1047
1048 // Convolve the whole tile one stripe at a time
1049 RestorationTileLimits remaining_stripes = *limits;
1050 int i = 0;
1051 while (i < unit_h) {
1052 int copy_above, copy_below;
1053 remaining_stripes.v_start = limits->v_start + i;
1054
1055 get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, ©_above,
1056 ©_below);
1057
1058 const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1059 const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1060
1061 // Work out where this stripe's boundaries are within
1062 // rsb->stripe_boundary_{above,below}
1063 const int tile_stripe =
1064 (remaining_stripes.v_start - tile_rect->top + runit_offset) /
1065 full_stripe_height;
1066 const int frame_stripe = tile_stripe0 + tile_stripe;
1067 const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1068
1069 // Calculate this stripe's height, based on two rules:
1070 // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
1071 // * We can't extend past the end of the current restoration unit
1072 const int nominal_stripe_height =
1073 full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
1074 const int h = AOMMIN(nominal_stripe_height,
1075 remaining_stripes.v_end - remaining_stripes.v_start);
1076
1077 setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1078 h, data8, stride, rlbs, copy_above,
1079 copy_below, optimized_lr);
1080
1081 stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1082 dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
1083
1084 restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1085 data8, stride, copy_above, copy_below,
1086 optimized_lr);
1087
1088 i += h;
1089 }
1090 }
1091
// Restoration-unit visitor installed by
// av1_loop_restoration_filter_frame_init(): 'priv' is the FilterFrameCtxt of
// the plane being processed, and the unit with index rest_unit_idx is filtered
// by forwarding that context's settings to
// av1_loop_restoration_filter_unit().
static void filter_frame_on_unit(const RestorationTileLimits *limits,
                                 const AV1PixelRect *tile_rect,
                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
                                 RestorationLineBuffers *rlbs) {
  const FilterFrameCtxt *const plane_ctxt = (const FilterFrameCtxt *)priv;
  const RestorationInfo *const plane_rsi = plane_ctxt->rsi;

  av1_loop_restoration_filter_unit(
      limits, &plane_rsi->unit_info[rest_unit_idx], &plane_rsi->boundaries,
      rlbs, tile_rect, plane_ctxt->tile_stripe0, plane_ctxt->ss_x,
      plane_ctxt->ss_y, plane_ctxt->highbd, plane_ctxt->bit_depth,
      plane_ctxt->data8, plane_ctxt->data_stride, plane_ctxt->dst8,
      plane_ctxt->dst_stride, tmpbuf, plane_rsi->optimized_lr);
}
1105
// Prepares lr_ctxt for filtering 'frame'.
//
// (Re)allocates cm->rst_frame as the destination buffer, reporting
// AOM_CODEC_MEM_ERROR through aom_internal_error() on failure, and installs
// filter_frame_on_unit() as the per-unit callback. optimized_lr is recorded in
// every plane's RestorationInfo. For each plane whose frame restoration type
// is not RESTORE_NONE, the source plane is extended by RESTORATION_BORDER
// pixels via av1_extend_frame() and the plane's FilterFrameCtxt is filled in
// with a whole-frame tile rectangle.
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
                                            YV12_BUFFER_CONFIG *frame,
                                            AV1_COMMON *cm, int optimized_lr,
                                            int num_planes) {
  const SequenceHeader *const seq_params = &cm->seq_params;
  const int highbd = seq_params->use_highbitdepth;
  const int bit_depth = seq_params->bit_depth;

  // The destination has the luma crop dimensions of the source frame.
  lr_ctxt->dst = &cm->rst_frame;
  const int alloc_status = aom_realloc_frame_buffer(
      lr_ctxt->dst, frame->crop_widths[0], frame->crop_heights[0],
      seq_params->subsampling_x, seq_params->subsampling_y, highbd,
      AOM_RESTORATION_FRAME_BORDER, cm->features.byte_alignment, NULL, NULL,
      NULL);
  if (alloc_status < 0) {
    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                       "Failed to allocate restoration dst buffer");
  }

  lr_ctxt->on_rest_unit = filter_frame_on_unit;
  lr_ctxt->frame = frame;

  for (int plane = 0; plane < num_planes; ++plane) {
    RestorationInfo *const rsi = &cm->rst_info[plane];
    rsi->optimized_lr = optimized_lr;

    // Planes without restoration need no border extension or context.
    if (rsi->frame_restoration_type == RESTORE_NONE) continue;

    const int is_uv = plane > 0;
    uint8_t *const src_buf = frame->buffers[plane];
    const int src_stride = frame->strides[is_uv];

    av1_extend_frame(src_buf, frame->crop_widths[is_uv],
                     frame->crop_heights[is_uv], src_stride,
                     RESTORATION_BORDER, RESTORATION_BORDER, highbd);

    FilterFrameCtxt *const plane_ctxt = &lr_ctxt->ctxt[plane];
    plane_ctxt->rsi = rsi;
    plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
    plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
    plane_ctxt->highbd = highbd;
    plane_ctxt->bit_depth = bit_depth;
    plane_ctxt->data8 = src_buf;
    plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
    plane_ctxt->data_stride = src_stride;
    plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
    plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv);
    plane_ctxt->tile_stripe0 = 0;
  }
}
1157
// Copies the filtered planes from loop_rest_ctxt->dst back into
// loop_rest_ctxt->frame. Only planes whose frame restoration type is not
// RESTORE_NONE are copied, and only the area of the plane's tile rectangle.
void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
                                      AV1_COMMON *cm, int num_planes) {
  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
                           int vstart, int vend);
  // One partial-copy routine per plane, in Y, U, V order.
  static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
                                         aom_yv12_partial_coloc_copy_u,
                                         aom_yv12_partial_coloc_copy_v };
  assert(num_planes <= 3);

  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      const AV1PixelRect *const rect = &loop_rest_ctxt->ctxt[plane].tile_rect;
      copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, rect->left,
                       rect->right, rect->top, rect->bottom);
    }
  }
}
1174
// Invokes lr_ctxt->on_rest_unit on every restoration unit of every plane whose
// frame restoration type is not RESTORE_NONE, passing the plane's
// FilterFrameCtxt as the callback's private data and cm->rst_tmpbuf /
// cm->rlbs as scratch buffers.
static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
                                        int num_planes) {
  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      FilterFrameCtxt *const plane_ctxt = &lr_ctxt->ctxt[plane];
      av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
                                     plane_ctxt, &plane_ctxt->tile_rect,
                                     cm->rst_tmpbuf, cm->rlbs);
    }
  }
}
1189
// Applies loop restoration to a whole frame. lr_ctxt must point to an
// AV1LrStruct. The work is done in three phases: set up the per-plane
// contexts, filter every restoration unit into the destination buffer, then
// copy the filtered planes back into 'frame'. Must not be called for frames
// that are entirely lossless.
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
                                       AV1_COMMON *cm, int optimized_lr,
                                       void *lr_ctxt) {
  assert(!cm->features.all_lossless);

  AV1LrStruct *const lr = (AV1LrStruct *)lr_ctxt;
  const int plane_count = av1_num_planes(cm);

  av1_loop_restoration_filter_frame_init(lr, frame, cm, optimized_lr,
                                         plane_count);
  foreach_rest_unit_in_planes(lr, cm, plane_count);
  av1_loop_restoration_copy_planes(lr, cm, plane_count);
}
1205
// Visits every restoration unit in one row of units of a tile, left to right.
//
// Units are unit_size columns wide, except that a remainder narrower than
// 1.5 * unit_size is folded into a single final unit. For each unit,
// limits->h_start / h_end are updated, the sync-read callback(s) are invoked,
// on_rest_unit is called with the unit's index
// (unit_idx0 + row_number * hunits_per_tile + column), and finally the
// sync-write callback is invoked. The vertical fields of 'limits' must have
// been set by the caller.
void av1_foreach_rest_unit_in_row(
    RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
    int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
    void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
    sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
    struct AV1LrSyncData *const lr_sync) {
  const int tile_w = tile_rect->right - tile_rect->left;
  // Widest allowed unit: anything narrower than this is not split further.
  const int ext_size = unit_size * 3 / 2;
  // Index of the first unit in this row.
  const int row_unit_idx0 = unit_idx0 + row_number * hunits_per_tile;
  const int has_row_below = (row_number + 1) < vunits_per_tile;

  int unit_w;
  int unit_col = 0;
  for (int x0 = 0; x0 < tile_w; x0 += unit_w, ++unit_col) {
    const int remaining_w = tile_w - x0;
    unit_w = (remaining_w < ext_size) ? remaining_w : unit_size;

    limits->h_start = tile_rect->left + x0;
    limits->h_end = tile_rect->left + x0 + unit_w;
    assert(limits->h_end <= tile_rect->right);

    // No sync for even numbered rows.
    // For odd numbered rows, loop restoration of the current block requires
    // the LR of the top-right and bottom-right blocks to be completed.

    // top-right sync
    on_sync_read(lr_sync, row_number, unit_col, plane);
    if (has_row_below) {
      // bottom-right sync
      on_sync_read(lr_sync, row_number + 2, unit_col, plane);
    }

    on_rest_unit(limits, tile_rect, row_unit_idx0 + unit_col, priv, tmpbuf,
                 rlbs);

    on_sync_write(lr_sync, row_number, unit_col, hunits_per_tile, plane);
  }
}
1244
// No-op sync-read callback, used when restoration units are visited without
// any row synchronisation (see foreach_rest_unit_in_tile()). All arguments
// are ignored.
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
  // Nothing to wait for; the casts only silence unused-parameter warnings.
  (void)plane;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1251
// No-op sync-write callback, used when restoration units are visited without
// any row synchronisation (see foreach_rest_unit_in_tile()). All arguments
// are ignored.
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                             const int sb_cols, int plane) {
  // Nothing to signal; the casts only silence unused-parameter warnings.
  (void)plane;
  (void)sb_cols;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1260
// Visits every restoration unit of a tile, one row of units at a time, with
// the no-op sync callbacks.
//
// Rows of units are unit_size pixels tall, except that a remainder shorter
// than 1.5 * unit_size is folded into a single final row. Each row's vertical
// limits are then shifted up by RESTORATION_UNIT_OFFSET >> ss_y so that they
// line up with the restoration processing stripes: the top is clamped to the
// tile's top edge, and the bottom is left unshifted for the last row, which
// ends at the tile's bottom edge.
static void foreach_rest_unit_in_tile(
    const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
    int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
    int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
    int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
  const int tile_h = tile_rect->bottom - tile_rect->top;
  // Tallest allowed row: anything shorter than this is not split further.
  const int ext_size = unit_size * 3 / 2;
  const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
  // Index of the tile's first restoration unit.
  const int unit_idx0 = (tile_col + tile_row * tile_cols) * units_per_tile;

  int unit_h;
  int unit_row = 0;
  for (int y0 = 0; y0 < tile_h; y0 += unit_h, ++unit_row) {
    const int remaining_h = tile_h - y0;
    unit_h = (remaining_h < ext_size) ? remaining_h : unit_size;

    // Row extent before the stripe-alignment shift.
    const int nominal_top = tile_rect->top + y0;
    const int nominal_bottom = nominal_top + unit_h;
    assert(nominal_bottom <= tile_rect->bottom);

    // Offset the row upwards to align with the restoration processing stripe.
    // The horizontal limits are filled in per unit by the row visitor.
    RestorationTileLimits limits;
    limits.v_start = AOMMAX(tile_rect->top, nominal_top - voffset);
    limits.v_end = (nominal_bottom < tile_rect->bottom)
                       ? nominal_bottom - voffset
                       : nominal_bottom;

    av1_foreach_rest_unit_in_row(
        &limits, tile_rect, on_rest_unit, unit_row, unit_size, unit_idx0,
        hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
        av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);
  }
}
1295
// Calls on_rest_unit for every restoration unit of 'plane', treating
// tile_rect as a single tile located at (LR_TILE_ROW, LR_TILE_COL) and using
// the unit counts and unit size recorded in cm->rst_info[plane].
void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
                                    rest_unit_visitor_t on_rest_unit,
                                    void *priv, AV1PixelRect *tile_rect,
                                    int32_t *tmpbuf,
                                    RestorationLineBuffers *rlbs) {
  const RestorationInfo *const plane_rsi = &cm->rst_info[plane];
  // Vertical subsampling only applies to the chroma planes.
  const int ss_y = (plane > 0) && cm->seq_params.subsampling_y;

  foreach_rest_unit_in_tile(
      tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
      plane_rsi->horz_units_per_tile, plane_rsi->vert_units_per_tile,
      plane_rsi->units_per_tile, plane_rsi->restoration_unit_size, ss_y, plane,
      on_rest_unit, priv, tmpbuf, rlbs);
}
1311
// Computes the range of restoration units whose top-left corner lies inside
// the superblock at (mi_row, mi_col) on the given plane.
//
// Returns 0 without writing the outputs when bsize is not the sequence's
// superblock size or when the plane's frame restoration type is
// RESTORE_NONE. Otherwise the half-open ranges [*rcol0, *rcol1) and
// [*rrow0, *rrow1) of unit columns / rows are stored, and the return value is
// non-zero iff both ranges are non-empty.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1) {
  assert(rcol0 && rcol1 && rrow0 && rrow1);

  if (bsize != cm->seq_params.sb_size ||
      cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) {
    return 0;
  }

  assert(!cm->features.all_lossless);

  const int is_uv = plane > 0;
  const int ss_x = is_uv && cm->seq_params.subsampling_x;
  const int ss_y = is_uv && cm->seq_params.subsampling_y;

  // The whole frame is treated as one tile, so the tile origin in mi units is
  // (0, 0) and mi_row / mi_col are already tile-relative.
  const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
  const int sb_row0 = mi_row;
  const int sb_col0 = mi_col;
  const int sb_row1 = sb_row0 + mi_size_high[bsize];
  const int sb_col1 = sb_col0 + mi_size_wide[bsize];

  // Number of restoration units actually present in this tile (which might
  // be strictly less than the per-tile counts stored in the RestorationInfo).
  const int unit_size = cm->rst_info[plane].restoration_unit_size;
  const int horz_units =
      av1_lr_count_units_in_tile(unit_size, tile_rect.right - tile_rect.left);
  const int vert_units =
      av1_lr_count_units_in_tile(unit_size, tile_rect.bottom - tile_rect.top);

  // Conversion from mi units to restoration-unit indices is
  //   index = m * num / den
  // where num is the size of an mi unit on this plane and den the unit size.
  // With superres, restoration units live in the upscaled domain: writing D
  // for the superres denominator and N for the numerator, an upscaled offset
  // u and a downscaled mi offset m satisfy MI_SIZE * m = (N / D) * u, so
  // u = D * MI_SIZE * m / N. Hence num gains a factor D and den a factor N
  // horizontally; the vertical direction is never scaled.
  int num_x = MI_SIZE >> ss_x;
  int den_x = unit_size;
  if (av1_superres_scaled(cm)) {
    num_x *= cm->superres_scale_denominator;
    den_x *= SCALE_NUMERATOR;
  }
  const int num_y = MI_SIZE >> ss_y;
  const int den_y = unit_size;

  // All divisions below round up: if the superblock starts at unit column
  // 10.1, the first unit starting inside it has column index 11.
  const int round_x = den_x - 1;
  const int round_y = den_y - 1;

  const int first_col = (sb_col0 * num_x + round_x) / den_x;
  const int first_row = (sb_row0 * num_y + round_y) / den_y;
  // The end of the range is the same calculation for the superblock to the
  // below-right. At the bottom or right of the tile that unit might not
  // exist, so clamp to the number of units present.
  const int end_col = AOMMIN((sb_col1 * num_x + round_x) / den_x, horz_units);
  const int end_row = AOMMIN((sb_row1 * num_y + round_y) / den_y, vert_units);

  *rcol0 = first_col;
  *rrow0 = first_row;
  *rcol1 = end_col;
  *rrow1 = end_row;

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1386
// Extends 'height' rows to the left and right by 'extend' pixels each,
// replicating the first and last pixel of every row. 'buf' points at the first
// pixel of the first row and 'stride' is the distance between rows in bytes.
// When use_highbitdepth is non-zero the rows hold uint16_t samples, and width
// and extend are counted in samples.
static void extend_lines(uint8_t *buf, int width, int height, int stride,
                         int extend, int use_highbitdepth) {
  uint8_t *row = buf;

  if (use_highbitdepth) {
    for (int y = 0; y < height; ++y, row += stride) {
      uint16_t *const px = (uint16_t *)row;
      aom_memset16(px - extend, px[0], extend);
      aom_memset16(px + width, px[width - 1], extend);
    }
  } else {
    for (int y = 0; y < height; ++y, row += stride) {
      memset(row - extend, row[0], extend);
      memset(row + width, row[width - 1], extend);
    }
  }
}
1402
// Saves RESTORATION_CTX_VERT rows of 'frame', starting at 'row' of the given
// plane, into the boundary buffer slot belonging to 'stripe'. is_above
// selects between boundaries->stripe_boundary_above and
// boundaries->stripe_boundary_below.
//
// When superres is in use the rows are upscaled with
// av1_upscale_normative_rows() while being saved, otherwise they are copied
// verbatim. Afterwards the saved rows are extended horizontally by
// RESTORATION_EXTRA_HORZ pixels on each side.
static void save_deblock_boundary_lines(
    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
    int stripe, int use_highbd, int is_above,
    RestorationStripeBoundaries *boundaries) {
  const int is_uv = plane > 0;

  // Source rows, addressed in bytes.
  const uint8_t *const src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
  const int src_stride = frame->strides[is_uv] << use_highbd;
  const uint8_t *const src_rows = src_buf + row * src_stride;

  // Destination rows, addressed in bytes. The buffer's first
  // RESTORATION_EXTRA_HORZ pixels of each row are room for the left extension.
  uint8_t *const bdry_buf = is_above ? boundaries->stripe_boundary_above
                                     : boundaries->stripe_boundary_below;
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
  uint8_t *const bdry_rows = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd) +
                             RESTORATION_CTX_VERT * stripe * bdry_stride;

  // There is a rare case in which a processing stripe can end 1px above the
  // crop border. We still want deblocked pixels from below the stripe then,
  // but only one "below" row exists, so it is fetched once and duplicated.
  // This is equivalent to clamping the sample locations against the crop
  // border.
  const int rows_available = frame->crop_heights[is_uv] - row;
  const int lines_to_save = AOMMIN(RESTORATION_CTX_VERT, rows_available);
  assert(lines_to_save == 1 || lines_to_save == 2);

  int upscaled_width;
  if (av1_superres_scaled(cm)) {
    const int ss_x = is_uv && cm->seq_params.subsampling_x;
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;

    // The upscaler takes pixel strides and, for high bit depth, pointers
    // converted with CONVERT_TO_BYTEPTR().
    const uint8_t *const up_src =
        use_highbd ? CONVERT_TO_BYTEPTR(src_rows) : src_rows;
    uint8_t *const up_dst =
        use_highbd ? CONVERT_TO_BYTEPTR(bdry_rows) : bdry_rows;
    av1_upscale_normative_rows(cm, up_src, frame->strides[is_uv], up_dst,
                               boundaries->stripe_boundary_stride, plane,
                               lines_to_save);
  } else {
    upscaled_width = frame->crop_widths[is_uv];

    const int copy_bytes = upscaled_width << use_highbd;
    for (int line = 0; line < lines_to_save; ++line) {
      memcpy(bdry_rows + line * bdry_stride, src_rows + line * src_stride,
             copy_bytes);
    }
  }
  const int line_bytes = upscaled_width << use_highbd;

  // If we only saved one line, then copy it into the second line buffer
  if (lines_to_save == 1) {
    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
  }

  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
               RESTORATION_EXTRA_HORZ, use_highbd);
}
1457
// Saves a single row of `frame` as loop-restoration context for one stripe.
//
// Row `row` of plane `plane` is copied into every one of the
// RESTORATION_CTX_VERT context rows in the slot for stripe `stripe` of either
// boundaries->stripe_boundary_above (is_above != 0) or
// boundaries->stripe_boundary_below (is_above == 0). Each context row is then
// padded horizontally by RESTORATION_EXTRA_HORZ samples on both sides.
//
// `use_highbd` must be 0 or 1: it is used as a shift count to turn sample
// counts/strides into byte counts/strides.
static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                     const AV1_COMMON *cm, int plane, int row,
                                     int stripe, int use_highbd, int is_above,
                                     RestorationStripeBoundaries *boundaries) {
  const int is_uv = plane > 0;
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
  const int src_stride = frame->strides[is_uv] << use_highbd;
  // Widen to ptrdiff_t before multiplying: for very large frames the byte
  // offset row * src_stride can exceed INT_MAX, and signed int overflow is
  // undefined behavior.
  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;

  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
                               : boundaries->stripe_boundary_below;
  // Skip the left padding so that bdry_rows points at the first real sample.
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
  const int src_width = frame->crop_widths[is_uv];

  // At the point where this function is called, we've already applied
  // superres. So no upscaling is needed here: the row is copied directly
  // from the (already upscaled) frame, using the upscaled width when
  // superres is active.
  const int ss_x = is_uv && cm->seq_params.subsampling_x;
  const int upscaled_width = av1_superres_scaled(cm)
                                 ? (cm->superres_upscaled_width + ss_x) >> ss_x
                                 : src_width;
  const int line_bytes = upscaled_width << use_highbd;
  for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
    // Copy the line at 'row' into both context lines. This is because
    // we want to (effectively) extend the outermost row of CDEF data
    // from this tile to produce a border, rather than using deblocked
    // pixels from the tile above/below.
    memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
  }
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
               RESTORATION_EXTRA_HORZ, use_highbd);
}
1492
// Walks the processing stripes of one plane from top to bottom and saves the
// context rows for each stripe into cm->rst_info[plane].boundaries.
//
// Stripes are (RESTORATION_PROC_UNIT_SIZE >> ss_y) rows high and their
// boundaries are shifted up by (RESTORATION_UNIT_OFFSET >> ss_y) rows, with
// the first stripe clamped to start at the top of the frame rectangle.
//
// This is meant to be called twice per plane:
//  - after_cdef == 0: deblocked rows are saved for every stripe edge that is
//    not at the top of the first stripe / at the bottom of the plane.
//  - after_cdef != 0: for the remaining edges (top of the first stripe,
//    bottom of the plane) the outermost row inside the stripe is saved.
static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                         int use_highbd, int plane,
                                         AV1_COMMON *cm, int after_cdef) {
  const int is_uv = plane > 0;
  const int ss_y = is_uv && cm->seq_params.subsampling_y;
  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;

  // Rectangle covering the whole frame for this plane.
  const AV1PixelRect frame_rect = av1_whole_frame_rect(cm, is_uv);
  RestorationStripeBoundaries *const bdry = &cm->rst_info[plane].boundaries;
  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);

  int stripe_idx = 0;
  while (1) {
    // First row of this stripe, clamped so stripe 0 starts at the rect top.
    int rel_top = stripe_idx * stripe_height - stripe_off;
    if (rel_top < 0) rel_top = 0;
    const int top = frame_rect.top + rel_top;
    if (top >= frame_rect.bottom) break;

    // One past the last row of this stripe, clamped to the rect bottom.
    int bottom = frame_rect.top + (stripe_idx + 1) * stripe_height - stripe_off;
    if (bottom > frame_rect.bottom) bottom = frame_rect.bottom;

    // Only the top and bottom of the frame as a whole use CDEF pixels;
    // every other stripe edge uses deblocked pixels for context.
    const int has_stripe_above = stripe_idx > 0;
    const int has_rows_below = bottom < plane_height;

    if (after_cdef) {
      // CDEF context is needed exactly where deblocked context was *not*
      // saved. The row saved is the outermost one inside the stripe rather
      // than data from outside it.
      if (!has_stripe_above) {
        save_cdef_boundary_lines(frame, cm, plane, top, stripe_idx, use_highbd,
                                 1, bdry);
      }
      if (!has_rows_below) {
        save_cdef_boundary_lines(frame, cm, plane, bottom - 1, stripe_idx,
                                 use_highbd, 0, bdry);
      }
    } else {
      // Deblocked context: the rows just above and just below the stripe.
      if (has_stripe_above) {
        save_deblock_boundary_lines(frame, cm, plane,
                                    top - RESTORATION_CTX_VERT, stripe_idx,
                                    use_highbd, 1, bdry);
      }
      if (has_rows_below) {
        save_deblock_boundary_lines(frame, cm, plane, bottom, stripe_idx,
                                    use_highbd, 0, bdry);
      }
    }

    ++stripe_idx;
  }
}
1555
1556 // For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1557 // lines to be used as boundary in the loop restoration process. The
1558 // lines are saved in rst_internal.stripe_boundary_lines
av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG * frame,AV1_COMMON * cm,int after_cdef)1559 void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1560 AV1_COMMON *cm, int after_cdef) {
1561 const int num_planes = av1_num_planes(cm);
1562 const int use_highbd = cm->seq_params.use_highbitdepth;
1563 for (int p = 0; p < num_planes; ++p) {
1564 save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
1565 }
1566 }
1567