1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 *
11 */
12
13 #include <math.h>
14
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17 #include "config/aom_scale_rtcd.h"
18
19 #include "aom_mem/aom_mem.h"
20 #include "av1/common/onyxc_int.h"
21 #include "av1/common/resize.h"
22 #include "av1/common/restoration.h"
23 #include "aom_dsp/aom_dsp_common.h"
24 #include "aom_mem/aom_mem.h"
25
26 #include "aom_ports/mem.h"
27
28 // The 's' values are calculated based on original 'r' and 'e' values in the
29 // spec using GenSgrprojVtable().
30 // Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
31 const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
32 { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
33 { { 2, 1 }, { 93, 1618 } }, { { 2, 1 }, { 80, 1438 } },
34 { { 2, 1 }, { 70, 1295 } }, { { 2, 1 }, { 58, 1177 } },
35 { { 2, 1 }, { 47, 1079 } }, { { 2, 1 }, { 37, 996 } },
36 { { 2, 1 }, { 30, 925 } }, { { 2, 1 }, { 25, 863 } },
37 { { 0, 1 }, { -1, 2589 } }, { { 0, 1 }, { -1, 1618 } },
38 { { 0, 1 }, { -1, 1177 } }, { { 0, 1 }, { -1, 925 } },
39 { { 2, 0 }, { 56, -1 } }, { { 2, 0 }, { 22, -1 } },
40 };
41
av1_whole_frame_rect(const AV1_COMMON * cm,int is_uv)42 AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
43 AV1PixelRect rect;
44
45 int ss_x = is_uv && cm->seq_params.subsampling_x;
46 int ss_y = is_uv && cm->seq_params.subsampling_y;
47
48 rect.top = 0;
49 rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
50 rect.left = 0;
51 rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
52 return rect;
53 }
54
// Number of restoration units along one dimension of a tile (pass a width or
// a height as tile_size). The tile size is divided by the unit size with
// round-to-nearest rather than round-up, which models how the right-most or
// bottom-most unit may stretch to up to 150% of its nominal size. The result
// is clamped to at least 1 so that tiles smaller than half a unit still get
// one unit.
int av1_lr_count_units_in_tile(int unit_size, int tile_size) {
  const int half_unit = unit_size >> 1;
  const int nearest = (tile_size + half_unit) / unit_size;
  return nearest > 1 ? nearest : 1;
}
65
// (Re)allocates rsi->unit_info so that it can hold one RestorationUnitInfo per
// restoration unit of the plane selected by is_uv, and records the unit grid
// dimensions in rsi. Allocation failure is reported through CHECK_MEM_ERROR
// on cm.
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                  int is_uv) {
  // The whole plane is treated as a single tile here, so the "largest tile"
  // is simply the whole-frame rectangle.
  const AV1PixelRect frame = av1_whole_frame_rect(cm, is_uv);
  const int ru_size = rsi->restoration_unit_size;

  // Units per row/column use round-to-nearest (minimum 1); see
  // av1_lr_count_units_in_tile().
  const int units_across =
      av1_lr_count_units_in_tile(ru_size, frame.right - frame.left);
  const int units_down =
      av1_lr_count_units_in_tile(ru_size, frame.bottom - frame.top);

  rsi->horz_units_per_tile = units_across;
  rsi->vert_units_per_tile = units_down;
  rsi->units_per_tile = units_across * units_down;

  // Exactly one tile, so the total unit count equals units_per_tile.
  const int nunits = rsi->units_per_tile;

  // Release any array from a previous call before allocating the new one.
  aom_free(rsi->unit_info);
  CHECK_MEM_ERROR(cm, rsi->unit_info,
                  (RestorationUnitInfo *)aom_memalign(
                      16, sizeof(*rsi->unit_info) * nunits));
}
100
// Frees the unit_info array held by rst_info and leaves the member set to
// NULL so that no stale address remains in the struct.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  RestorationUnitInfo *const units = rst_info->unit_info;
  rst_info->unit_info = NULL;
  aom_free(units);
}
105
#if 0
// Compiled out: generator for the 's' values stored in sgr_params[], kept for
// reference only.
// NOTE(review): this reads params->e, while the sgr_params[] table in this
// file is documented as holding 'r' and 's' -- confirm the struct layout
// before re-enabling.
//
// Pair of values for each sgrproj parameter:
// Index 0 corresponds to r[0], e[0]
// Index 1 corresponds to r[1], e[1]
int sgrproj_mtable[SGRPROJ_PARAMS][2];

static void GenSgrprojVtable() {
  for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
    const sgr_params_type *const params = &sgr_params[i];
    for (int j = 0; j < 2; ++j) {
      const int e = params->e[j];
      const int r = params->r[j];
      if (r == 0) {
        // Filter is disabled: mark the entry invalid.
        sgrproj_mtable[i][j] = -1;
        continue;
      }
      // Filter is enabled: s = round(2^SGRPROJ_MTABLE_BITS / (n^2 * e)),
      // where n is the number of pixels in the (2r + 1) x (2r + 1) window.
      const int n = (2 * r + 1) * (2 * r + 1);
      const int n2e = n * n * e;
      assert(n2e != 0);
      sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
    }
  }
}
#endif
130
// One-time precalculation hook for loop restoration. The only work it would
// do, regenerating the self-guided 's' table with GenSgrprojVtable(), is
// compiled out, so the function currently has no effect.
//
// Declared with an explicit (void) parameter list so the definition is a real
// prototype; an empty "()" in C leaves the parameters unspecified.
void av1_loop_restoration_precal(void) {
#if 0
  GenSgrprojVtable();
#endif
}
136
// Pads an 8-bit image in place by edge replication. 'data' points at the
// top-left pixel of a width x height image with the given stride; the buffer
// must have room for border_horz extra pixels on each side of every row and
// border_vert extra rows above and below.
static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
                               int border_horz, int border_vert) {
  // Left and right borders: repeat the first / last pixel of every row.
  for (int row = 0; row < height; ++row) {
    uint8_t *const line = data + row * stride;
    memset(line - border_horz, line[0], border_horz);
    memset(line + width, line[width - 1], border_horz);
  }

  // Top and bottom borders: repeat the first / last row, including the
  // horizontal padding that was just written.
  uint8_t *const padded = data - border_horz;
  const int padded_width = width + 2 * border_horz;
  const int last_row = height - 1;
  for (int k = 1; k <= border_vert; ++k) {
    memcpy(padded - k * stride, padded, padded_width);
  }
  for (int k = 1; k <= border_vert; ++k) {
    memcpy(padded + (last_row + k) * stride, padded + last_row * stride,
           padded_width);
  }
}
155
// 16-bit counterpart of extend_frame_lowbd(): pads a width x height image of
// uint16_t pixels in place by edge replication. width, stride and the border
// sizes are in pixels, not bytes.
static void extend_frame_highbd(uint16_t *data, int width, int height,
                                int stride, int border_horz, int border_vert) {
  // Left and right borders: repeat the first / last pixel of every row.
  for (int row = 0; row < height; ++row) {
    uint16_t *const line = data + row * stride;
    for (int k = 1; k <= border_horz; ++k) line[-k] = line[0];
    for (int k = 0; k < border_horz; ++k) line[width + k] = line[width - 1];
  }

  // Top and bottom borders: repeat the first / last row, including the
  // horizontal padding that was just written.
  uint16_t *const padded = data - border_horz;
  const int last_row = height - 1;
  for (int k = 1; k <= border_vert; ++k) {
    memcpy(padded - k * stride, padded,
           (width + 2 * border_horz) * sizeof(uint16_t));
  }
  for (int k = 1; k <= border_vert; ++k) {
    memcpy(padded + (last_row + k) * stride, padded + last_row * stride,
           (width + 2 * border_horz) * sizeof(uint16_t));
  }
}
175
// Pads a frame in place by edge replication, dispatching on bit depth. When
// highbd is nonzero, 'data' is converted with CONVERT_TO_SHORTPTR and treated
// as 16-bit pixels; otherwise it is treated as 8-bit pixels.
void extend_frame(uint8_t *data, int width, int height, int stride,
                  int border_horz, int border_vert, int highbd) {
  if (!highbd) {
    extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
    return;
  }
  extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
                      border_horz, border_vert);
}
184
// Copies a width x height rectangle of 8-bit pixels from src to dst, one row
// at a time. The two buffers may have different strides.
static void copy_tile_lowbd(int width, int height, const uint8_t *src,
                            int src_stride, uint8_t *dst, int dst_stride) {
  int row = 0;
  while (row < height) {
    const uint8_t *const from = src + row * src_stride;
    uint8_t *const to = dst + row * dst_stride;
    memcpy(to, from, width);
    ++row;
  }
}
190
// Copies a width x height rectangle of 16-bit pixels from src to dst, one row
// at a time. width and both strides are counted in pixels.
static void copy_tile_highbd(int width, int height, const uint16_t *src,
                             int src_stride, uint16_t *dst, int dst_stride) {
  int row = 0;
  while (row < height) {
    const uint16_t *const from = src + row * src_stride;
    uint16_t *const to = dst + row * dst_stride;
    memcpy(to, from, width * sizeof(*dst));
    ++row;
  }
}
196
// Copies a width x height rectangle from src to dst, dispatching on bit
// depth. When highbd is nonzero both pointers are converted with
// CONVERT_TO_SHORTPTR and the pixels are copied as 16-bit values.
static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
                      uint8_t *dst, int dst_stride, int highbd) {
  if (!highbd) {
    copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
    return;
  }
  copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
                   CONVERT_TO_SHORTPTR(dst), dst_stride);
}
205
// Yields a uint8_t pointer usable with memcpy for the 8-bit-typed frame
// pointer ptr8: for high bit depth the pointer is first passed through
// CONVERT_TO_SHORTPTR and cast back to uint8_t *, otherwise it is returned
// unchanged.
#define REAL_PTR(use_hbd, ptr8) \
  ((use_hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(ptr8) : (ptr8))
207
208 // With striped loop restoration, the filtering for each 64-pixel stripe gets
209 // most of its input from the output of CDEF (stored in data8), but we need to
210 // fill out a border of 3 pixels above/below the stripe according to the
211 // following
212 // rules:
213 //
214 // * At a frame boundary, we copy the outermost row of CDEF pixels three times.
215 // This extension is done by a call to extend_frame() at the start of the loop
216 // restoration process, so the value of copy_above/copy_below doesn't strictly
217 // matter.
218 // However, by setting *copy_above = *copy_below = 1 whenever loop filtering
219 // across tiles is disabled, we can allow
220 // {setup,restore}_processing_stripe_boundary to assume that the top/bottom
221 // data has always been copied, simplifying the behaviour at the left and
222 // right edges of tiles.
223 //
224 // * If we're at a tile boundary and loop filtering across tiles is enabled,
225 // then there is a logical stripe which is 64 pixels high, but which is split
226 // into an 8px high and a 56px high stripe so that the processing (and
227 // coefficient set usage) can be aligned to tiles.
228 // In this case, we use the 3 rows of CDEF output across the boundary for
229 // context; this corresponds to leaving the frame buffer as-is.
230 //
231 // * If we're at a tile boundary and loop filtering across tiles is disabled,
232 // then we take the outermost row of CDEF pixels *within the current tile*
233 // and copy it three times. Thus we behave exactly as if the tile were a full
234 // frame.
235 //
236 // * Otherwise, we're at a stripe boundary within a tile. In that case, we
237 // take 2 rows of deblocked pixels and extend them to 3 rows of context.
238 //
239 // The distinction between the latter two cases is handled by the
240 // av1_loop_restoration_save_boundary_lines() function, so here we just need
241 // to decide if we're overwriting the above/below boundary pixels or not.
get_stripe_boundary_info(const RestorationTileLimits * limits,const AV1PixelRect * tile_rect,int ss_y,int * copy_above,int * copy_below)242 static void get_stripe_boundary_info(const RestorationTileLimits *limits,
243 const AV1PixelRect *tile_rect, int ss_y,
244 int *copy_above, int *copy_below) {
245 *copy_above = 1;
246 *copy_below = 1;
247
248 const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
249 const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
250
251 const int first_stripe_in_tile = (limits->v_start == tile_rect->top);
252 const int this_stripe_height =
253 full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
254 const int last_stripe_in_tile =
255 (limits->v_start + this_stripe_height >= tile_rect->bottom);
256
257 if (first_stripe_in_tile) *copy_above = 0;
258 if (last_stripe_in_tile) *copy_below = 0;
259 }
260
261 // Overwrite the border pixels around a processing stripe so that the conditions
262 // listed above get_stripe_boundary_info() are preserved.
263 // We save the pixels which get overwritten into a temporary buffer, so that
264 // they can be restored by restore_processing_stripe_boundary() after we've
265 // processed the stripe.
266 //
267 // limits gives the rectangular limits of the remaining stripes for the current
268 // restoration unit. rsb is the stored stripe boundaries (taken from either
269 // deblock or CDEF output as necessary).
270 //
271 // tile_rect is the limits of the current tile and tile_stripe0 is the index of
272 // the first stripe in this tile (needed to convert the tile-relative stripe
273 // index we get from limits into something we can look up in rsb).
setup_processing_stripe_boundary(const RestorationTileLimits * limits,const RestorationStripeBoundaries * rsb,int rsb_row,int use_highbd,int h,uint8_t * data8,int data_stride,RestorationLineBuffers * rlbs,int copy_above,int copy_below,int opt)274 static void setup_processing_stripe_boundary(
275 const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
276 int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
277 RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
278 // Offsets within the line buffers. The buffer logically starts at column
279 // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
280 // has column x0 in the buffer.
281 const int buf_stride = rsb->stripe_boundary_stride;
282 const int buf_x0_off = limits->h_start;
283 const int line_width =
284 (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
285 const int line_size = line_width << use_highbd;
286
287 const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
288
289 // Replace RESTORATION_BORDER pixels above the top of the stripe
290 // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
291 // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
292 // duplicating the topmost of the 2 lines (see the AOMMAX call when
293 // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
294 //
295 // Special case: If we're at the top of a tile, which isn't on the topmost
296 // tile row, and we're allowed to loop filter across tiles, then we have a
297 // logical 64-pixel-high stripe which has been split into an 8-pixel high
298 // stripe and a 56-pixel high stripe (the current one). So, in this case,
299 // we want to leave the boundary alone!
300 if (!opt) {
301 if (copy_above) {
302 uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
303
304 for (int i = -RESTORATION_BORDER; i < 0; ++i) {
305 const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
306 const int buf_off = buf_x0_off + buf_row * buf_stride;
307 const uint8_t *buf =
308 rsb->stripe_boundary_above + (buf_off << use_highbd);
309 uint8_t *dst8 = data8_tl + i * data_stride;
310 // Save old pixels, then replace with data from stripe_boundary_above
311 memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
312 REAL_PTR(use_highbd, dst8), line_size);
313 memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
314 }
315 }
316
317 // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
318 // The second buffer row is repeated, so src_row gets the values 0, 1, 1
319 // for i = 0, 1, 2.
320 if (copy_below) {
321 const int stripe_end = limits->v_start + h;
322 uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
323
324 for (int i = 0; i < RESTORATION_BORDER; ++i) {
325 const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
326 const int buf_off = buf_x0_off + buf_row * buf_stride;
327 const uint8_t *src =
328 rsb->stripe_boundary_below + (buf_off << use_highbd);
329
330 uint8_t *dst8 = data8_bl + i * data_stride;
331 // Save old pixels, then replace with data from stripe_boundary_below
332 memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
333 memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
334 }
335 }
336 } else {
337 if (copy_above) {
338 uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
339
340 // Only save and overwrite i=-RESTORATION_BORDER line.
341 uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
342 // Save old pixels, then replace with data from stripe_boundary_above
343 memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
344 memcpy(REAL_PTR(use_highbd, dst8),
345 REAL_PTR(use_highbd,
346 data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
347 line_size);
348 }
349
350 if (copy_below) {
351 const int stripe_end = limits->v_start + h;
352 uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
353
354 // Only save and overwrite i=2 line.
355 uint8_t *dst8 = data8_bl + 2 * data_stride;
356 // Save old pixels, then replace with data from stripe_boundary_below
357 memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
358 memcpy(REAL_PTR(use_highbd, dst8),
359 REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
360 }
361 }
362 }
363
364 // This function restores the boundary lines modified by
365 // setup_processing_stripe_boundary.
366 //
367 // Note: We need to be careful when handling the corners of the processing
368 // unit, because (eg.) the top-left corner is considered to be part of
369 // both the left and top borders. This means that, depending on the
370 // loop_filter_across_tiles_enabled flag, the corner pixels might get
371 // overwritten twice, once as part of the "top" border and once as part
372 // of the "left" border (or similar for other corners).
373 //
374 // Everything works out fine as long as we make sure to reverse the order
375 // when restoring, ie. we need to restore the left/right borders followed
376 // by the top/bottom borders.
restore_processing_stripe_boundary(const RestorationTileLimits * limits,const RestorationLineBuffers * rlbs,int use_highbd,int h,uint8_t * data8,int data_stride,int copy_above,int copy_below,int opt)377 static void restore_processing_stripe_boundary(
378 const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
379 int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
380 int copy_below, int opt) {
381 const int line_width =
382 (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
383 const int line_size = line_width << use_highbd;
384
385 const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
386
387 if (!opt) {
388 if (copy_above) {
389 uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
390 for (int i = -RESTORATION_BORDER; i < 0; ++i) {
391 uint8_t *dst8 = data8_tl + i * data_stride;
392 memcpy(REAL_PTR(use_highbd, dst8),
393 rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
394 }
395 }
396
397 if (copy_below) {
398 const int stripe_bottom = limits->v_start + h;
399 uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
400
401 for (int i = 0; i < RESTORATION_BORDER; ++i) {
402 if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
403
404 uint8_t *dst8 = data8_bl + i * data_stride;
405 memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
406 }
407 }
408 } else {
409 if (copy_above) {
410 uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
411
412 // Only restore i=-RESTORATION_BORDER line.
413 uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
414 memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
415 }
416
417 if (copy_below) {
418 const int stripe_bottom = limits->v_start + h;
419 uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
420
421 // Only restore i=2 line.
422 if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
423 uint8_t *dst8 = data8_bl + 2 * data_stride;
424 memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
425 }
426 }
427 }
428 }
429
wiener_filter_stripe(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int32_t * tmpbuf,int bit_depth)430 static void wiener_filter_stripe(const RestorationUnitInfo *rui,
431 int stripe_width, int stripe_height,
432 int procunit_width, const uint8_t *src,
433 int src_stride, uint8_t *dst, int dst_stride,
434 int32_t *tmpbuf, int bit_depth) {
435 (void)tmpbuf;
436 (void)bit_depth;
437 assert(bit_depth == 8);
438 const ConvolveParams conv_params = get_conv_params_wiener(8);
439
440 for (int j = 0; j < stripe_width; j += procunit_width) {
441 int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
442 const uint8_t *src_p = src + j;
443 uint8_t *dst_p = dst + j;
444 av1_wiener_convolve_add_src(
445 src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
446 rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
447 }
448 }
449
450 /* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
451 over the input. The window is of size (2r + 1)x(2r + 1), and we
452 specialize to r = 1, 2, 3. A default function is used for r > 3.
453
454 Each loop follows the same format: We keep a window's worth of input
455 in individual variables and select data out of that as appropriate.
456 */
boxsum1(int32_t * src,int width,int height,int src_stride,int sqr,int32_t * dst,int dst_stride)457 static void boxsum1(int32_t *src, int width, int height, int src_stride,
458 int sqr, int32_t *dst, int dst_stride) {
459 int i, j, a, b, c;
460 assert(width > 2 * SGRPROJ_BORDER_HORZ);
461 assert(height > 2 * SGRPROJ_BORDER_VERT);
462
463 // Vertical sum over 3-pixel regions, from src into dst.
464 if (!sqr) {
465 for (j = 0; j < width; ++j) {
466 a = src[j];
467 b = src[src_stride + j];
468 c = src[2 * src_stride + j];
469
470 dst[j] = a + b;
471 for (i = 1; i < height - 2; ++i) {
472 // Loop invariant: At the start of each iteration,
473 // a = src[(i - 1) * src_stride + j]
474 // b = src[(i ) * src_stride + j]
475 // c = src[(i + 1) * src_stride + j]
476 dst[i * dst_stride + j] = a + b + c;
477 a = b;
478 b = c;
479 c = src[(i + 2) * src_stride + j];
480 }
481 dst[i * dst_stride + j] = a + b + c;
482 dst[(i + 1) * dst_stride + j] = b + c;
483 }
484 } else {
485 for (j = 0; j < width; ++j) {
486 a = src[j] * src[j];
487 b = src[src_stride + j] * src[src_stride + j];
488 c = src[2 * src_stride + j] * src[2 * src_stride + j];
489
490 dst[j] = a + b;
491 for (i = 1; i < height - 2; ++i) {
492 dst[i * dst_stride + j] = a + b + c;
493 a = b;
494 b = c;
495 c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
496 }
497 dst[i * dst_stride + j] = a + b + c;
498 dst[(i + 1) * dst_stride + j] = b + c;
499 }
500 }
501
502 // Horizontal sum over 3-pixel regions of dst
503 for (i = 0; i < height; ++i) {
504 a = dst[i * dst_stride];
505 b = dst[i * dst_stride + 1];
506 c = dst[i * dst_stride + 2];
507
508 dst[i * dst_stride] = a + b;
509 for (j = 1; j < width - 2; ++j) {
510 // Loop invariant: At the start of each iteration,
511 // a = src[i * src_stride + (j - 1)]
512 // b = src[i * src_stride + (j )]
513 // c = src[i * src_stride + (j + 1)]
514 dst[i * dst_stride + j] = a + b + c;
515 a = b;
516 b = c;
517 c = dst[i * dst_stride + (j + 2)];
518 }
519 dst[i * dst_stride + j] = a + b + c;
520 dst[i * dst_stride + (j + 1)] = b + c;
521 }
522 }
523
boxsum2(int32_t * src,int width,int height,int src_stride,int sqr,int32_t * dst,int dst_stride)524 static void boxsum2(int32_t *src, int width, int height, int src_stride,
525 int sqr, int32_t *dst, int dst_stride) {
526 int i, j, a, b, c, d, e;
527 assert(width > 2 * SGRPROJ_BORDER_HORZ);
528 assert(height > 2 * SGRPROJ_BORDER_VERT);
529
530 // Vertical sum over 5-pixel regions, from src into dst.
531 if (!sqr) {
532 for (j = 0; j < width; ++j) {
533 a = src[j];
534 b = src[src_stride + j];
535 c = src[2 * src_stride + j];
536 d = src[3 * src_stride + j];
537 e = src[4 * src_stride + j];
538
539 dst[j] = a + b + c;
540 dst[dst_stride + j] = a + b + c + d;
541 for (i = 2; i < height - 3; ++i) {
542 // Loop invariant: At the start of each iteration,
543 // a = src[(i - 2) * src_stride + j]
544 // b = src[(i - 1) * src_stride + j]
545 // c = src[(i ) * src_stride + j]
546 // d = src[(i + 1) * src_stride + j]
547 // e = src[(i + 2) * src_stride + j]
548 dst[i * dst_stride + j] = a + b + c + d + e;
549 a = b;
550 b = c;
551 c = d;
552 d = e;
553 e = src[(i + 3) * src_stride + j];
554 }
555 dst[i * dst_stride + j] = a + b + c + d + e;
556 dst[(i + 1) * dst_stride + j] = b + c + d + e;
557 dst[(i + 2) * dst_stride + j] = c + d + e;
558 }
559 } else {
560 for (j = 0; j < width; ++j) {
561 a = src[j] * src[j];
562 b = src[src_stride + j] * src[src_stride + j];
563 c = src[2 * src_stride + j] * src[2 * src_stride + j];
564 d = src[3 * src_stride + j] * src[3 * src_stride + j];
565 e = src[4 * src_stride + j] * src[4 * src_stride + j];
566
567 dst[j] = a + b + c;
568 dst[dst_stride + j] = a + b + c + d;
569 for (i = 2; i < height - 3; ++i) {
570 dst[i * dst_stride + j] = a + b + c + d + e;
571 a = b;
572 b = c;
573 c = d;
574 d = e;
575 e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
576 }
577 dst[i * dst_stride + j] = a + b + c + d + e;
578 dst[(i + 1) * dst_stride + j] = b + c + d + e;
579 dst[(i + 2) * dst_stride + j] = c + d + e;
580 }
581 }
582
583 // Horizontal sum over 5-pixel regions of dst
584 for (i = 0; i < height; ++i) {
585 a = dst[i * dst_stride];
586 b = dst[i * dst_stride + 1];
587 c = dst[i * dst_stride + 2];
588 d = dst[i * dst_stride + 3];
589 e = dst[i * dst_stride + 4];
590
591 dst[i * dst_stride] = a + b + c;
592 dst[i * dst_stride + 1] = a + b + c + d;
593 for (j = 2; j < width - 3; ++j) {
594 // Loop invariant: At the start of each iteration,
595 // a = src[i * src_stride + (j - 2)]
596 // b = src[i * src_stride + (j - 1)]
597 // c = src[i * src_stride + (j )]
598 // d = src[i * src_stride + (j + 1)]
599 // e = src[i * src_stride + (j + 2)]
600 dst[i * dst_stride + j] = a + b + c + d + e;
601 a = b;
602 b = c;
603 c = d;
604 d = e;
605 e = dst[i * dst_stride + (j + 3)];
606 }
607 dst[i * dst_stride + j] = a + b + c + d + e;
608 dst[i * dst_stride + (j + 1)] = b + c + d + e;
609 dst[i * dst_stride + (j + 2)] = c + d + e;
610 }
611 }
612
// Dispatches to the box-sum specialization for radius r. Only r = 1 and r = 2
// are supported; any other value trips the assertion and, with assertions
// disabled, leaves dst untouched.
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default:
      assert(0 && "Invalid value of r in self-guided filter");
      break;
  }
}
622
decode_xq(const int * xqd,int * xq,const sgr_params_type * params)623 void decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
624 if (params->r[0] == 0) {
625 xq[0] = 0;
626 xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
627 } else if (params->r[1] == 0) {
628 xq[0] = xqd[0];
629 xq[1] = 0;
630 } else {
631 xq[0] = xqd[0];
632 xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
633 }
634 }
635
// Lookup table indexed by z in [0, 255], giving the blend weight stored into
// A[] by the self-guided filter; the values lie in [1, 256].
// NOTE(review): most entries look like round(256 * x / (x + 1)), but the
// first and last entries deviate from that -- confirm against the spec.
// Special case: entry 0 maps to 1 (corresponding to a value of 1/256)
// instead of 0. See the comments in calculate_intermediate_result() for why.
const int32_t x_by_xplus1[256] = {
  1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, 240,
  241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, 248, 248,
  248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, 250, 251, 251,
  251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, 252, 252, 252, 252,
  252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 256,
};
658
659 const int32_t one_by_x[MAX_NELEM] = {
660 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
661 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164,
662 };
663
calculate_intermediate_result(int32_t * dgd,int width,int height,int dgd_stride,int bit_depth,int sgr_params_idx,int radius_idx,int pass,int32_t * A,int32_t * B)664 static void calculate_intermediate_result(int32_t *dgd, int width, int height,
665 int dgd_stride, int bit_depth,
666 int sgr_params_idx, int radius_idx,
667 int pass, int32_t *A, int32_t *B) {
668 const sgr_params_type *const params = &sgr_params[sgr_params_idx];
669 const int r = params->r[radius_idx];
670 const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
671 const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
672 // Adjusting the stride of A and B here appears to avoid bad cache effects,
673 // leading to a significant speed improvement.
674 // We also align the stride to a multiple of 16 bytes, for consistency
675 // with the SIMD version of this function.
676 int buf_stride = ((width_ext + 3) & ~3) + 16;
677 const int step = pass == 0 ? 1 : 2;
678 int i, j;
679
680 assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
681 assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
682 "Need SGRPROJ_BORDER_* >= r+1");
683
684 boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
685 width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
686 boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
687 width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
688 A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
689 B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
690 // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
691 // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
692 for (i = -1; i < height + 1; i += step) {
693 for (j = -1; j < width + 1; ++j) {
694 const int k = i * buf_stride + j;
695 const int n = (2 * r + 1) * (2 * r + 1);
696
697 // a < 2^16 * n < 2^22 regardless of bit depth
698 uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
699 // b < 2^8 * n < 2^14 regardless of bit depth
700 uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
701
702 // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
703 // and p itself satisfies p < 2^14 * n^2 < 2^26.
704 // This bound on p is due to:
705 // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
706 //
707 // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
708 // This is an artefact of rounding, and can only happen if all pixels
709 // are (almost) identical, so in this case we saturate to p=0.
710 uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
711
712 const uint32_t s = params->s[radius_idx];
713
714 // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
715 // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
716 // (this holds even after accounting for the rounding in s)
717 const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
718
719 // Note: We have to be quite careful about the value of A[k].
720 // This is used as a blend factor between individual pixel values and the
721 // local mean. So it logically has a range of [0, 256], including both
722 // endpoints.
723 //
724 // This is a pain for hardware, as we'd like something which can be stored
725 // in exactly 8 bits.
726 // Further, in the calculation of B[k] below, if z == 0 and r == 2,
727 // then A[k] "should be" 0. But then we can end up setting B[k] to a value
728 // slightly above 2^(8 + bit depth), due to rounding in the value of
729 // one_by_x[25-1].
730 //
731 // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
732 // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
733 // overflow), without significantly affecting the final result: z == 0
734 // implies that the image is essentially "flat", so the local mean and
735 // individual pixel values are very similar.
736 //
      // Note that saturating on the other side, ie. requiring A[k] <= 255,
738 // would be a bad idea, as that corresponds to the case where the image
739 // is very variable, when we want to preserve the local pixel value as
740 // much as possible.
741 A[k] = x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256]
742
743 // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
744 // one_by_x[n - 1] = round(2^12 / n)
745 // => the product here is < 2^(20 + bit_depth) <= 2^32,
746 // and B[k] is set to a value < 2^(8 + bit depth)
747 // This holds even with the rounding in one_by_x and in the overall
748 // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
749 B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
750 (uint32_t)B[k] *
751 (uint32_t)one_by_x[n - 1],
752 SGRPROJ_RECIP_BITS);
753 }
754 }
755 }
756
selfguided_restoration_fast_internal(int32_t * dgd,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int sgr_params_idx,int radius_idx)757 static void selfguided_restoration_fast_internal(
758 int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
759 int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
760 const sgr_params_type *const params = &sgr_params[sgr_params_idx];
761 const int r = params->r[radius_idx];
762 const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
763 // Adjusting the stride of A and B here appears to avoid bad cache effects,
764 // leading to a significant speed improvement.
765 // We also align the stride to a multiple of 16 bytes, for consistency
766 // with the SIMD version of this function.
767 int buf_stride = ((width_ext + 3) & ~3) + 16;
768 int32_t A_[RESTORATION_PROC_UNIT_PELS];
769 int32_t B_[RESTORATION_PROC_UNIT_PELS];
770 int32_t *A = A_;
771 int32_t *B = B_;
772 int i, j;
773 calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
774 sgr_params_idx, radius_idx, 1, A, B);
775 A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
776 B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
777
778 // Use the A[] and B[] arrays to calculate the filtered image
779 (void)r;
780 assert(r == 2);
781 for (i = 0; i < height; ++i) {
782 if (!(i & 1)) { // even row
783 for (j = 0; j < width; ++j) {
784 const int k = i * buf_stride + j;
785 const int l = i * dgd_stride + j;
786 const int m = i * dst_stride + j;
787 const int nb = 5;
788 const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
789 (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
790 A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
791 5;
792 const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
793 (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
794 B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
795 5;
796 const int32_t v = a * dgd[l] + b;
797 dst[m] =
798 ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
799 }
800 } else { // odd row
801 for (j = 0; j < width; ++j) {
802 const int k = i * buf_stride + j;
803 const int l = i * dgd_stride + j;
804 const int m = i * dst_stride + j;
805 const int nb = 4;
806 const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
807 const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
808 const int32_t v = a * dgd[l] + b;
809 dst[m] =
810 ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
811 }
812 }
813 }
814 }
815
selfguided_restoration_internal(int32_t * dgd,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int sgr_params_idx,int radius_idx)816 static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
817 int dgd_stride, int32_t *dst,
818 int dst_stride, int bit_depth,
819 int sgr_params_idx,
820 int radius_idx) {
821 const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
822 // Adjusting the stride of A and B here appears to avoid bad cache effects,
823 // leading to a significant speed improvement.
824 // We also align the stride to a multiple of 16 bytes, for consistency
825 // with the SIMD version of this function.
826 int buf_stride = ((width_ext + 3) & ~3) + 16;
827 int32_t A_[RESTORATION_PROC_UNIT_PELS];
828 int32_t B_[RESTORATION_PROC_UNIT_PELS];
829 int32_t *A = A_;
830 int32_t *B = B_;
831 int i, j;
832 calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
833 sgr_params_idx, radius_idx, 0, A, B);
834 A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
835 B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
836
837 // Use the A[] and B[] arrays to calculate the filtered image
838 for (i = 0; i < height; ++i) {
839 for (j = 0; j < width; ++j) {
840 const int k = i * buf_stride + j;
841 const int l = i * dgd_stride + j;
842 const int m = i * dst_stride + j;
843 const int nb = 5;
844 const int32_t a =
845 (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
846 4 +
847 (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
848 A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
849 3;
850 const int32_t b =
851 (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
852 4 +
853 (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
854 B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
855 3;
856 const int32_t v = a * dgd[l] + b;
857 dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
858 }
859 }
860 }
861
av1_selfguided_restoration_c(const uint8_t * dgd8,int width,int height,int dgd_stride,int32_t * flt0,int32_t * flt1,int flt_stride,int sgr_params_idx,int bit_depth,int highbd)862 int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
863 int dgd_stride, int32_t *flt0, int32_t *flt1,
864 int flt_stride, int sgr_params_idx,
865 int bit_depth, int highbd) {
866 int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
867 const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
868 int32_t *dgd32 =
869 dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
870
871 if (highbd) {
872 const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
873 for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
874 for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
875 dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
876 }
877 }
878 } else {
879 for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
880 for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
881 dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
882 }
883 }
884 }
885
886 const sgr_params_type *const params = &sgr_params[sgr_params_idx];
887 // If params->r == 0 we skip the corresponding filter. We only allow one of
888 // the radii to be 0, as having both equal to 0 would be equivalent to
889 // skipping SGR entirely.
890 assert(!(params->r[0] == 0 && params->r[1] == 0));
891
892 if (params->r[0] > 0)
893 selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
894 flt0, flt_stride, bit_depth,
895 sgr_params_idx, 0);
896 if (params->r[1] > 0)
897 selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
898 flt_stride, bit_depth, sgr_params_idx, 1);
899 return 0;
900 }
901
apply_selfguided_restoration_c(const uint8_t * dat8,int width,int height,int stride,int eps,const int * xqd,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth,int highbd)902 void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
903 int stride, int eps, const int *xqd,
904 uint8_t *dst8, int dst_stride,
905 int32_t *tmpbuf, int bit_depth,
906 int highbd) {
907 int32_t *flt0 = tmpbuf;
908 int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
909 assert(width * height <= RESTORATION_UNITPELS_MAX);
910
911 const int ret = av1_selfguided_restoration_c(
912 dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
913 (void)ret;
914 assert(!ret);
915 const sgr_params_type *const params = &sgr_params[eps];
916 int xq[2];
917 decode_xq(xqd, xq, params);
918 for (int i = 0; i < height; ++i) {
919 for (int j = 0; j < width; ++j) {
920 const int k = i * width + j;
921 uint8_t *dst8ij = dst8 + i * dst_stride + j;
922 const uint8_t *dat8ij = dat8 + i * stride + j;
923
924 const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
925 const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
926 int32_t v = u << SGRPROJ_PRJ_BITS;
927 // If params->r == 0 then we skipped the filtering in
928 // av1_selfguided_restoration_c, i.e. flt[k] == u
929 if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
930 if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
931 const int16_t w =
932 (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
933
934 const uint16_t out = clip_pixel_highbd(w, bit_depth);
935 if (highbd)
936 *CONVERT_TO_SHORTPTR(dst8ij) = out;
937 else
938 *dst8ij = (uint8_t)out;
939 }
940 }
941 }
942
sgrproj_filter_stripe(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int32_t * tmpbuf,int bit_depth)943 static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
944 int stripe_width, int stripe_height,
945 int procunit_width, const uint8_t *src,
946 int src_stride, uint8_t *dst, int dst_stride,
947 int32_t *tmpbuf, int bit_depth) {
948 (void)bit_depth;
949 assert(bit_depth == 8);
950
951 for (int j = 0; j < stripe_width; j += procunit_width) {
952 int w = AOMMIN(procunit_width, stripe_width - j);
953 apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
954 rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
955 dst + j, dst_stride, tmpbuf, bit_depth, 0);
956 }
957 }
958
wiener_filter_stripe_highbd(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth)959 static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
960 int stripe_width, int stripe_height,
961 int procunit_width, const uint8_t *src8,
962 int src_stride, uint8_t *dst8,
963 int dst_stride, int32_t *tmpbuf,
964 int bit_depth) {
965 (void)tmpbuf;
966 const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
967
968 for (int j = 0; j < stripe_width; j += procunit_width) {
969 int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
970 const uint8_t *src8_p = src8 + j;
971 uint8_t *dst8_p = dst8 + j;
972 av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
973 rui->wiener_info.hfilter, 16,
974 rui->wiener_info.vfilter, 16, w,
975 stripe_height, &conv_params, bit_depth);
976 }
977 }
978
sgrproj_filter_stripe_highbd(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth)979 static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
980 int stripe_width, int stripe_height,
981 int procunit_width,
982 const uint8_t *src8, int src_stride,
983 uint8_t *dst8, int dst_stride,
984 int32_t *tmpbuf, int bit_depth) {
985 for (int j = 0; j < stripe_width; j += procunit_width) {
986 int w = AOMMIN(procunit_width, stripe_width - j);
987 apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
988 rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
989 dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
990 }
991 }
992
993 typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
994 int stripe_width, int stripe_height,
995 int procunit_width, const uint8_t *src,
996 int src_stride, uint8_t *dst, int dst_stride,
997 int32_t *tmpbuf, int bit_depth);
998
999 #define NUM_STRIPE_FILTERS 4
1000
1001 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
1002 wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
1003 sgrproj_filter_stripe_highbd
1004 };
1005
1006 // Filter one restoration unit
av1_loop_restoration_filter_unit(const RestorationTileLimits * limits,const RestorationUnitInfo * rui,const RestorationStripeBoundaries * rsb,RestorationLineBuffers * rlbs,const AV1PixelRect * tile_rect,int tile_stripe0,int ss_x,int ss_y,int highbd,int bit_depth,uint8_t * data8,int stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int optimized_lr)1007 void av1_loop_restoration_filter_unit(
1008 const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
1009 const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
1010 const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
1011 int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
1012 int dst_stride, int32_t *tmpbuf, int optimized_lr) {
1013 RestorationType unit_rtype = rui->restoration_type;
1014
1015 int unit_h = limits->v_end - limits->v_start;
1016 int unit_w = limits->h_end - limits->h_start;
1017 uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;
1018 uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
1019
1020 if (unit_rtype == RESTORE_NONE) {
1021 copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
1022 return;
1023 }
1024
1025 const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1026 assert(filter_idx < NUM_STRIPE_FILTERS);
1027 const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1028
1029 const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1030
1031 // Convolve the whole tile one stripe at a time
1032 RestorationTileLimits remaining_stripes = *limits;
1033 int i = 0;
1034 while (i < unit_h) {
1035 int copy_above, copy_below;
1036 remaining_stripes.v_start = limits->v_start + i;
1037
1038 get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, ©_above,
1039 ©_below);
1040
1041 const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1042 const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1043
1044 // Work out where this stripe's boundaries are within
1045 // rsb->stripe_boundary_{above,below}
1046 const int tile_stripe =
1047 (remaining_stripes.v_start - tile_rect->top + runit_offset) /
1048 full_stripe_height;
1049 const int frame_stripe = tile_stripe0 + tile_stripe;
1050 const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1051
1052 // Calculate this stripe's height, based on two rules:
1053 // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
1054 // * We can't extend past the end of the current restoration unit
1055 const int nominal_stripe_height =
1056 full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
1057 const int h = AOMMIN(nominal_stripe_height,
1058 remaining_stripes.v_end - remaining_stripes.v_start);
1059
1060 setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1061 h, data8, stride, rlbs, copy_above,
1062 copy_below, optimized_lr);
1063
1064 stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1065 dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
1066
1067 restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1068 data8, stride, copy_above, copy_below,
1069 optimized_lr);
1070
1071 i += h;
1072 }
1073 }
1074
// Restoration-unit visitor used when filtering a whole frame. `priv` is the
// per-plane FilterFrameCtxt; its fields, together with the unit's entry in
// rsi->unit_info[], are forwarded to av1_loop_restoration_filter_unit().
static void filter_frame_on_unit(const RestorationTileLimits *limits,
                                 const AV1PixelRect *tile_rect,
                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
                                 RestorationLineBuffers *rlbs) {
  const FilterFrameCtxt *const plane_ctxt = (const FilterFrameCtxt *)priv;
  const RestorationInfo *const rsi = plane_ctxt->rsi;
  const RestorationUnitInfo *const unit = &rsi->unit_info[rest_unit_idx];

  av1_loop_restoration_filter_unit(
      limits, unit, &rsi->boundaries, rlbs, tile_rect,
      plane_ctxt->tile_stripe0, plane_ctxt->ss_x, plane_ctxt->ss_y,
      plane_ctxt->highbd, plane_ctxt->bit_depth, plane_ctxt->data8,
      plane_ctxt->data_stride, plane_ctxt->dst8, plane_ctxt->dst_stride,
      tmpbuf, rsi->optimized_lr);
}
1088
// Prepares lr_ctxt for filtering `frame`.
//
// (Re)allocates cm->rst_frame as the destination buffer (reporting
// AOM_CODEC_MEM_ERROR through aom_internal_error() on failure), installs
// filter_frame_on_unit as the unit visitor, records optimized_lr in every
// plane's RestorationInfo and, for each plane whose frame restoration type
// is not RESTORE_NONE, extends the source plane by RESTORATION_BORDER and
// fills in the plane's FilterFrameCtxt.
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
                                            YV12_BUFFER_CONFIG *frame,
                                            AV1_COMMON *cm, int optimized_lr,
                                            int num_planes) {
  const SequenceHeader *const seq_params = &cm->seq_params;
  const int bit_depth = seq_params->bit_depth;
  const int highbd = seq_params->use_highbitdepth;
  const int frame_width = frame->crop_widths[0];
  const int frame_height = frame->crop_heights[0];

  lr_ctxt->dst = &cm->rst_frame;

  const int alloc_status = aom_realloc_frame_buffer(
      lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
      seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
      cm->byte_alignment, NULL, NULL, NULL);
  if (alloc_status < 0) {
    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                       "Failed to allocate restoration dst buffer");
  }

  lr_ctxt->on_rest_unit = filter_frame_on_unit;
  lr_ctxt->frame = frame;

  for (int plane = 0; plane < num_planes; ++plane) {
    RestorationInfo *const rsi = &cm->rst_info[plane];
    // Recorded for every plane, including ones that are not restored.
    rsi->optimized_lr = optimized_lr;

    if (rsi->frame_restoration_type == RESTORE_NONE) continue;

    const int is_uv = plane > 0;
    const int src_stride = frame->strides[is_uv];

    extend_frame(frame->buffers[plane], frame->crop_widths[is_uv],
                 frame->crop_heights[is_uv], src_stride, RESTORATION_BORDER,
                 RESTORATION_BORDER, highbd);

    FilterFrameCtxt *const plane_ctxt = &lr_ctxt->ctxt[plane];
    plane_ctxt->rsi = rsi;
    plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
    plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
    plane_ctxt->highbd = highbd;
    plane_ctxt->bit_depth = bit_depth;
    plane_ctxt->data8 = frame->buffers[plane];
    plane_ctxt->data_stride = src_stride;
    plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
    plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
    plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv);
    plane_ctxt->tile_stripe0 = 0;
  }
}
1140
// Copies the filtered area of every restored plane from
// loop_rest_ctxt->dst back into loop_rest_ctxt->frame. Planes whose frame
// restoration type is RESTORE_NONE are left alone.
void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
                                      AV1_COMMON *cm, int num_planes) {
  typedef void (*plane_copy_fn)(const YV12_BUFFER_CONFIG *src_ybc,
                                YV12_BUFFER_CONFIG *dst_ybc, int hstart,
                                int hend, int vstart, int vend);
  // One partial-copy routine per plane index (Y, U, V).
  static const plane_copy_fn plane_copiers[3] = {
    aom_yv12_partial_coloc_copy_y,
    aom_yv12_partial_coloc_copy_u,
    aom_yv12_partial_coloc_copy_v,
  };

  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      const AV1PixelRect rect = loop_rest_ctxt->ctxt[plane].tile_rect;
      plane_copiers[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame,
                           rect.left, rect.right, rect.top, rect.bottom);
    }
  }
}
1157
// Visits every restoration unit of every restored plane, calling
// lr_ctxt->on_rest_unit with that plane's FilterFrameCtxt as private data.
// Planes whose frame restoration type is RESTORE_NONE are skipped.
static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
                                        int num_planes) {
  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      FilterFrameCtxt *const plane_ctxt = &lr_ctxt->ctxt[plane];
      av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
                                     plane_ctxt, &plane_ctxt->tile_rect,
                                     cm->rst_tmpbuf, cm->rlbs);
    }
  }
}
1172
// Applies loop restoration to a whole frame in three steps: initialise the
// AV1LrStruct passed as lr_ctxt, filter every restoration unit of every
// plane into the destination buffer, then copy the filtered planes back
// into `frame`. Must not be called when cm->all_lossless is set.
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
                                       AV1_COMMON *cm, int optimized_lr,
                                       void *lr_ctxt) {
  assert(!cm->all_lossless);

  AV1LrStruct *const lr = (AV1LrStruct *)lr_ctxt;
  const int num_planes = av1_num_planes(cm);

  av1_loop_restoration_filter_frame_init(lr, frame, cm, optimized_lr,
                                         num_planes);
  foreach_rest_unit_in_planes(lr, cm, num_planes);
  av1_loop_restoration_copy_planes(lr, cm, num_planes);
}
1188
// Visits every restoration unit in one row of units of a tile, left to
// right.
//
// `limits` must already hold the row's vertical bounds; its h_start / h_end
// fields are overwritten for each unit before on_rest_unit is called. Units
// are unit_size wide, except that a remainder narrower than 1.5 * unit_size
// is absorbed into a single final unit. The unit index passed to the visitor
// is unit_idx0 + row_number * hunits_per_tile + column.
//
// on_sync_read / on_sync_write are invoked around each unit with lr_sync,
// the row number, the unit's column and the plane.
void av1_foreach_rest_unit_in_row(
    RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
    int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
    void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
    sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
    struct AV1LrSyncData *const lr_sync) {
  const int tile_w = tile_rect->right - tile_rect->left;
  const int ext_size = unit_size * 3 / 2;
  const int row_idx0 = unit_idx0 + row_number * hunits_per_tile;
  const int has_row_below = (row_number + 1) < vunits_per_tile;

  for (int x0 = 0, col = 0; x0 < tile_w; ++col) {
    const int remaining_w = tile_w - x0;
    const int w = (remaining_w < ext_size) ? remaining_w : unit_size;
    const int unit_left = tile_rect->left + x0;

    limits->h_start = unit_left;
    limits->h_end = unit_left + w;
    assert(limits->h_end <= tile_rect->right);

    // No sync for even numbered rows
    // For odd numbered rows, Loop Restoration of current block requires the LR
    // of top-right and bottom-right blocks to be completed

    // top-right sync
    on_sync_read(lr_sync, row_number, col, plane);
    if (has_row_below) {
      // bottom-right sync
      on_sync_read(lr_sync, row_number + 2, col, plane);
    }

    on_rest_unit(limits, tile_rect, row_idx0 + col, priv, tmpbuf, rlbs);

    on_sync_write(lr_sync, row_number, col, hunits_per_tile, plane);

    x0 += w;
  }
}
1227
// No-op sync-read callback: foreach_rest_unit_in_tile() passes it to
// av1_foreach_rest_unit_in_row() together with a NULL sync object.
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
  // Nothing to do; the casts only mark the parameters as intentionally unused.
  (void)plane, (void)c, (void)r, (void)lr_sync;
}
1234
// No-op sync-write callback: foreach_rest_unit_in_tile() passes it to
// av1_foreach_rest_unit_in_row() together with a NULL sync object.
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                             const int sb_cols, int plane) {
  // Nothing to do; the casts only mark the parameters as intentionally unused.
  (void)plane, (void)sb_cols, (void)c, (void)r, (void)lr_sync;
}
1243
// Visits every restoration unit of a tile, one row of units at a time, top
// to bottom, without any cross-row synchronisation (the dummy sync callbacks
// and a NULL sync object are used).
//
// Rows of units are unit_size tall, except that a remainder shorter than
// 1.5 * unit_size is absorbed into a single final row. Unit indices start at
// (tile_col + tile_row * tile_cols) * units_per_tile.
static void foreach_rest_unit_in_tile(
    const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
    int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
    int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
    int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
  const int tile_h = tile_rect->bottom - tile_rect->top;
  const int ext_size = unit_size * 3 / 2;
  const int unit_idx0 = (tile_col + tile_row * tile_cols) * units_per_tile;
  // Offset the tile upwards to align with the restoration processing stripe
  const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;

  for (int y0 = 0, unit_row = 0; y0 < tile_h; ++unit_row) {
    const int remaining_h = tile_h - y0;
    const int h = (remaining_h < ext_size) ? remaining_h : unit_size;
    const int nominal_top = tile_rect->top + y0;
    const int nominal_bottom = nominal_top + h;
    assert(nominal_bottom <= tile_rect->bottom);

    // Horizontal bounds are filled in per unit by
    // av1_foreach_rest_unit_in_row().
    RestorationTileLimits limits;
    limits.v_start = AOMMAX(tile_rect->top, nominal_top - voffset);
    // The last row of units keeps its bottom edge on the tile boundary.
    limits.v_end = (nominal_bottom < tile_rect->bottom)
                       ? nominal_bottom - voffset
                       : nominal_bottom;

    av1_foreach_rest_unit_in_row(
        &limits, tile_rect, on_rest_unit, unit_row, unit_size, unit_idx0,
        hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
        av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);

    y0 += h;
  }
}
1278
// Visits every restoration unit of one plane. The plane is handled as the
// single tile (LR_TILE_ROW, LR_TILE_COL) described by tile_rect, using the
// unit counts and unit size stored in cm->rst_info[plane].
void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
                                    rest_unit_visitor_t on_rest_unit,
                                    void *priv, AV1PixelRect *tile_rect,
                                    int32_t *tmpbuf,
                                    RestorationLineBuffers *rlbs) {
  const RestorationInfo *const rsi = &cm->rst_info[plane];
  // Only chroma planes can be vertically subsampled.
  const int ss_y = (plane > 0) && cm->seq_params.subsampling_y;

  foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
                            rsi->horz_units_per_tile, rsi->vert_units_per_tile,
                            rsi->units_per_tile, rsi->restoration_unit_size,
                            ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs);
}
1294
// Computes the half-open range of restoration units, [*rcol0, *rcol1) x
// [*rrow0, *rrow1), whose top-left corners fall inside the superblock at
// (mi_row, mi_col) on the given plane.
//
// Returns 0 without touching the outputs if bsize is not the sequence's
// superblock size or the plane's frame restoration type is RESTORE_NONE.
// Otherwise writes all four outputs and returns nonzero iff the range is
// non-empty.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1) {
  assert(rcol0 && rcol1 && rrow0 && rrow1);

  if (bsize != cm->seq_params.sb_size ||
      cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) {
    return 0;
  }

  assert(!cm->all_lossless);

  const int is_uv = plane > 0;

  const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
  const int tile_w = tile_rect.right - tile_rect.left;
  const int tile_h = tile_rect.bottom - tile_rect.top;

  // Compute the mi-unit corners of the superblock relative to the top-left of
  // the tile. The tile is the whole frame, so its origin is mi (0, 0) and the
  // relative position equals the absolute one.
  const int mi_rel_row0 = mi_row;
  const int mi_rel_col0 = mi_col;
  const int mi_rel_row1 = mi_rel_row0 + mi_size_high[bsize];
  const int mi_rel_col1 = mi_rel_col0 + mi_size_wide[bsize];

  const int size = cm->rst_info[plane].restoration_unit_size;

  // Calculate the number of restoration units in this tile (which might be
  // strictly less than rsi->horz_units_per_tile and rsi->vert_units_per_tile)
  const int horz_units = av1_lr_count_units_in_tile(size, tile_w);
  const int vert_units = av1_lr_count_units_in_tile(size, tile_h);

  // The size of an MI-unit on this plane of the image
  const int ss_x = is_uv && cm->seq_params.subsampling_x;
  const int ss_y = is_uv && cm->seq_params.subsampling_y;
  const int mi_size_x = MI_SIZE >> ss_x;
  const int mi_size_y = MI_SIZE >> ss_y;

  // Write m for the relative mi column or row, D for the superres denominator
  // and N for the superres numerator. If u is the upscaled pixel offset then
  // we can write the downscaled pixel offset in two ways as:
  //
  //   MI_SIZE * m = N / D u
  //
  // from which we get u = D * MI_SIZE * m / N
  //
  // Superres only scales horizontally, so only the x terms depend on it.
  const int superres = av1_superres_scaled(cm);
  const int mi_to_num_x =
      superres ? mi_size_x * cm->superres_scale_denominator : mi_size_x;
  const int denom_x = superres ? size * SCALE_NUMERATOR : size;
  const int mi_to_num_y = mi_size_y;
  const int denom_y = size;

  // Adding (denominator - 1) before dividing rounds the quotient up.
  const int rnd_x = denom_x - 1;
  const int rnd_y = denom_y - 1;

  // rcol0/rrow0 should be the first column/row of restoration units (relative
  // to the top-left of the tile) that doesn't start left/below of
  // mi_col/mi_row. For this calculation, we need to round up the division (if
  // the sb starts at runit column 10.1, the first matching runit has column
  // index 11)
  const int first_col = (mi_rel_col0 * mi_to_num_x + rnd_x) / denom_x;
  const int first_row = (mi_rel_row0 * mi_to_num_y + rnd_y) / denom_y;

  // rel_col1/rel_row1 is the equivalent calculation, but for the superblock
  // below-right. If we're at the bottom or right of the tile, this restoration
  // unit might not exist, in which case we'll clamp accordingly.
  const int next_col = (mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x;
  const int next_row = (mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y;

  *rcol0 = first_col;
  *rrow0 = first_row;
  *rcol1 = AOMMIN(next_col, horz_units);
  *rrow1 = AOMMIN(next_row, vert_units);

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1369
// Extend to left and right
//
// For each of `height` rows, replicates the row's first sample into the
// `extend` positions before it and its last sample into the `extend`
// positions after it. `buf` points at the first sample of the first row and
// `stride` is the distance in bytes between rows. When use_highbitdepth is
// set the row is treated as 16-bit samples, and `width` / `extend` count
// samples rather than bytes.
static void extend_lines(uint8_t *buf, int width, int height, int stride,
                         int extend, int use_highbitdepth) {
  uint8_t *row = buf;
  for (int rows_left = height; rows_left > 0; --rows_left) {
    if (!use_highbitdepth) {
      memset(row - extend, row[0], extend);
      memset(row + width, row[width - 1], extend);
    } else {
      uint16_t *const row16 = (uint16_t *)row;
      aom_memset16(row16 - extend, row16[0], extend);
      aom_memset16(row16 + width, row16[width - 1], extend);
    }
    row += stride;
  }
}
1385
// Saves the rows starting at `row` of `plane` into the stripe-boundary
// buffer slot for `stripe` (the "above" buffer if is_above, otherwise the
// "below" buffer).
//
// Up to RESTORATION_CTX_VERT rows are taken, clamped to the plane's crop
// height; if only one row is available it is duplicated. When superres is
// active the rows are upscaled with av1_upscale_normative_rows() on the way
// in; otherwise they are copied. Finally each saved row is padded on both
// sides by RESTORATION_EXTRA_HORZ samples.
static void save_deblock_boundary_lines(
    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
    int stripe, int use_highbd, int is_above,
    RestorationStripeBoundaries *boundaries) {
  const int is_uv = plane > 0;

  // Source rows, addressed in bytes (strides are doubled for 16-bit data).
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
  const int src_stride = frame->strides[is_uv] << use_highbd;
  const uint8_t *src_rows = src_buf + row * src_stride;

  // Destination rows inside the selected boundary buffer, also in bytes.
  uint8_t *bdry_buf;
  if (is_above) {
    bdry_buf = boundaries->stripe_boundary_above;
  } else {
    bdry_buf = boundaries->stripe_boundary_below;
  }
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;

  // There is a rare case in which a processing stripe can end 1px above the
  // crop border. In this case, we do want to use deblocked pixels from below
  // the stripe (hence why we ended up in this function), but instead of
  // fetching 2 "below" rows we need to fetch one and duplicate it.
  // This is equivalent to clamping the sample locations against the crop border
  const int rows_available = frame->crop_heights[is_uv] - row;
  const int lines_to_save = AOMMIN(RESTORATION_CTX_VERT, rows_available);
  assert(lines_to_save == 1 || lines_to_save == 2);

  int upscaled_width;
  if (av1_superres_scaled(cm)) {
    const int ss_x = is_uv && cm->seq_params.subsampling_x;
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;

    // The upscaler takes sample strides and, for 16-bit data,
    // CONVERT_TO_BYTEPTR style pointers.
    const uint8_t *up_src = src_rows;
    uint8_t *up_dst = bdry_rows;
    if (use_highbd) {
      up_src = CONVERT_TO_BYTEPTR(src_rows);
      up_dst = CONVERT_TO_BYTEPTR(bdry_rows);
    }
    av1_upscale_normative_rows(cm, up_src, frame->strides[is_uv], up_dst,
                               boundaries->stripe_boundary_stride, plane,
                               lines_to_save);
  } else {
    upscaled_width = frame->crop_widths[is_uv];
    const int copy_bytes = upscaled_width << use_highbd;
    for (int line = 0; line < lines_to_save; ++line) {
      memcpy(bdry_rows + line * bdry_stride, src_rows + line * src_stride,
             copy_bytes);
    }
  }

  const int line_bytes = upscaled_width << use_highbd;
  // If we only saved one line, then copy it into the second line buffer
  if (lines_to_save == 1) {
    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
  }

  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
               RESTORATION_EXTRA_HORZ, use_highbd);
}
1440
// Stores CDEF-filtered context for one restoration stripe boundary.
//
// The single row `row` of `plane` in `frame` is replicated into all
// RESTORATION_CTX_VERT context rows of slot `stripe`, in either
// boundaries->stripe_boundary_above (is_above != 0) or
// boundaries->stripe_boundary_below (is_above == 0). extend_lines() is then
// run over the saved rows with a RESTORATION_EXTRA_HORZ border.
static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                     const AV1_COMMON *cm, int plane, int row,
                                     int stripe, int use_highbd, int is_above,
                                     RestorationStripeBoundaries *boundaries) {
  const int is_uv = plane > 0;

  // Source row, addressed through a uint8_t pointer. Strides and widths are
  // shifted by use_highbd wherever they are used for this pointer math.
  const int src_stride = frame->strides[is_uv] << use_highbd;
  const uint8_t *const plane_base =
      REAL_PTR(use_highbd, frame->buffers[plane]);
  const uint8_t *const src_row = plane_base + row * src_stride;

  // Destination slot: skip the left border, then step to this stripe's rows.
  uint8_t *dst_base;
  if (is_above) {
    dst_base = boundaries->stripe_boundary_above;
  } else {
    dst_base = boundaries->stripe_boundary_below;
  }
  const int dst_stride = boundaries->stripe_boundary_stride << use_highbd;
  uint8_t *const dst_rows = dst_base + (RESTORATION_EXTRA_HORZ << use_highbd) +
                            RESTORATION_CTX_VERT * stripe * dst_stride;

  // By the time this function is called superres has already been applied,
  // so no upscaling is needed here: the row is taken as-is from the upscaled
  // frame, using the upscaled width when superres is in effect.
  const int ss_x = is_uv && cm->seq_params.subsampling_x;
  int out_width = frame->crop_widths[is_uv];
  if (av1_superres_scaled(cm)) {
    out_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
  }
  const int row_bytes = out_width << use_highbd;

  // Every context row receives a copy of the same source row. The intent is
  // to (effectively) extend the outermost row of CDEF output to form the
  // border, rather than using deblocked pixels from above/below.
  int r = 0;
  while (r < RESTORATION_CTX_VERT) {
    memcpy(dst_rows + r * dst_stride, src_row, row_bytes);
    ++r;
  }

  extend_lines(dst_rows, out_width, RESTORATION_CTX_VERT, dst_stride,
               RESTORATION_EXTRA_HORZ, use_highbd);
}
1475
// Walks every restoration stripe of one plane and saves its boundary context.
//
// The function is called in two passes, selected by `after_cdef`:
//   after_cdef == 0: save deblocked rows for boundaries that have picture
//                    content on the other side (interior boundaries).
//   after_cdef != 0: save CDEF rows for exactly the boundaries the first pass
//                    skipped, i.e. the top of the first stripe and any stripe
//                    bottom that reaches the plane height.
static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                         int use_highbd, int plane,
                                         AV1_COMMON *cm, int after_cdef) {
  const int is_uv = plane > 0;
  const int ss_y = is_uv && cm->seq_params.subsampling_y;
  const int stripe_rows = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  const int stripe_shift = RESTORATION_UNIT_OFFSET >> ss_y;
  const int plane_rows = ROUND_POWER_OF_TWO(cm->height, ss_y);

  // The whole frame is handled as one rectangle, so the stripe index used in
  // the loop is also the frame-level stripe index.
  const AV1PixelRect frame_rect = av1_whole_frame_rect(cm, is_uv);

  RestorationStripeBoundaries *const bounds = &cm->rst_info[plane].boundaries;

  for (int stripe_idx = 0;; ++stripe_idx) {
    // Stripes are shifted up by stripe_shift; the first one is clamped so it
    // starts at the top of the rectangle.
    int rel_top = stripe_idx * stripe_rows - stripe_shift;
    if (rel_top < 0) rel_top = 0;
    const int y_top = frame_rect.top + rel_top;
    if (y_top >= frame_rect.bottom) break;

    // Exclusive end row of the stripe, clamped to the rectangle.
    const int rel_end = (stripe_idx + 1) * stripe_rows - stripe_shift;
    int y_end = frame_rect.top + rel_end;
    if (frame_rect.bottom < y_end) y_end = frame_rect.bottom;

    // CDEF pixels are used only at the top and bottom of the frame as a
    // whole; every other boundary takes deblocked pixels as context.
    const int deblock_above = stripe_idx > 0;
    const int deblock_below = y_end < plane_rows;

    if (after_cdef) {
      // CDEF context is needed for a boundary iff deblocked context was *not*
      // saved for it. The rows saved are copies of the outermost line inside
      // the stripe rather than data from outside it.
      if (!deblock_above) {
        save_cdef_boundary_lines(frame, cm, plane, y_top, stripe_idx,
                                 use_highbd, 1, bounds);
      }
      if (!deblock_below) {
        save_cdef_boundary_lines(frame, cm, plane, y_end - 1, stripe_idx,
                                 use_highbd, 0, bounds);
      }
    } else {
      // Deblocked context: the rows just above the stripe and the rows
      // starting at its (exclusive) end.
      if (deblock_above) {
        save_deblock_boundary_lines(frame, cm, plane,
                                    y_top - RESTORATION_CTX_VERT, stripe_idx,
                                    use_highbd, 1, bounds);
      }
      if (deblock_below) {
        save_deblock_boundary_lines(frame, cm, plane, y_end, stripe_idx,
                                    use_highbd, 0, bounds);
      }
    }
  }
}
1538
1539 // For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1540 // lines to be used as boundary in the loop restoration process. The
1541 // lines are saved in rst_internal.stripe_boundary_lines
av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG * frame,AV1_COMMON * cm,int after_cdef)1542 void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1543 AV1_COMMON *cm, int after_cdef) {
1544 const int num_planes = av1_num_planes(cm);
1545 const int use_highbd = cm->seq_params.use_highbitdepth;
1546 for (int p = 0; p < num_planes; ++p) {
1547 save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
1548 }
1549 }
1550