/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 *
 */
12 
#include <assert.h>
#include <math.h>
#include <string.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/aom_scale_rtcd.h"

#include "aom_mem/aom_mem.h"
#include "av1/common/av1_common_int.h"
#include "av1/common/resize.h"
#include "av1/common/restoration.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_mem/aom_mem.h"

#include "aom_ports/mem.h"
27 
28 // The 's' values are calculated based on original 'r' and 'e' values in the
29 // spec using GenSgrprojVtable().
30 // Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
31 const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
32   { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
33   { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
34   { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
35   { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
36   { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
37   { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
38   { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
39   { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
40 };
41 
av1_whole_frame_rect(const AV1_COMMON * cm,int is_uv)42 AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
43   AV1PixelRect rect;
44 
45   int ss_x = is_uv && cm->seq_params.subsampling_x;
46   int ss_y = is_uv && cm->seq_params.subsampling_y;
47 
48   rect.top = 0;
49   rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
50   rect.left = 0;
51   rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
52   return rect;
53 }
54 
// Count horizontal or vertical units per tile (use a width or height for
// tile_size, respectively). We basically want to divide the tile size by the
// size of a restoration unit. Rather than rounding up unconditionally as you
// might expect, we round to nearest, which models the way a right or bottom
// restoration unit can extend to up to 150% its normal width or height. The
// max with 1 is to deal with tiles that are smaller than half of a restoration
// unit.
//
// unit_size must be nonzero (it is used as a divisor).
int av1_lr_count_units_in_tile(int unit_size, int tile_size) {
  // Round-to-nearest division: add half a unit before dividing.
  const int units = (tile_size + (unit_size >> 1)) / unit_size;
  return units > 1 ? units : 1;
}
65 
// (Re)allocates rsi->unit_info so that it can hold one RestorationUnitInfo per
// restoration unit of the given plane, and records the unit counts in rsi.
//
// cm:    common state; used for the frame dimensions and for error reporting
//        through CHECK_MEM_ERROR if the allocation fails.
// rsi:   restoration info for the plane. rsi->restoration_unit_size must
//        already be set. Any previous rsi->unit_info buffer is freed.
// is_uv: nonzero for a chroma plane, zero for luma.
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                  int is_uv) {
  // Restoration units are counted over the whole frame, which is treated as a
  // single tile here (see ntiles below).
  const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
  const int max_tile_w = tile_rect.right - tile_rect.left;
  const int max_tile_h = tile_rect.bottom - tile_rect.top;

  // To calculate hpertile and vpertile (horizontal and vertical units per
  // tile), we basically want to divide the largest tile width or height by the
  // size of a restoration unit. Rather than rounding up unconditionally as you
  // might expect, we round to nearest, which models the way a right or bottom
  // restoration unit can extend to up to 150% its normal width or height. The
  // max with 1 is to deal with tiles that are smaller than half of a
  // restoration unit.
  const int unit_size = rsi->restoration_unit_size;
  const int hpertile = av1_lr_count_units_in_tile(unit_size, max_tile_w);
  const int vpertile = av1_lr_count_units_in_tile(unit_size, max_tile_h);

  rsi->units_per_tile = hpertile * vpertile;
  rsi->horz_units_per_tile = hpertile;
  rsi->vert_units_per_tile = vpertile;

  const int ntiles = 1;
  const int nunits = ntiles * rsi->units_per_tile;

  // Release any previous buffer before allocating the new one.
  aom_free(rsi->unit_info);
  CHECK_MEM_ERROR(cm, rsi->unit_info,
                  (RestorationUnitInfo *)aom_memalign(
                      16, sizeof(*rsi->unit_info) * nunits));
}
100 
// Frees rst_info->unit_info and resets the pointer to NULL so that a later
// free or re-allocation does not touch a stale pointer.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  aom_free(rst_info->unit_info);
  rst_info->unit_info = NULL;
}
105 
#if 0
// NOTE: This generator is compiled out. It is kept as a reference for how the
// table values are derived from 'r' and 'e'.
//
// Pair of values for each sgrproj parameter:
// Index 0 corresponds to r[0], e[0]
// Index 1 corresponds to r[1], e[1]
int sgrproj_mtable[SGRPROJ_PARAMS][2];

static void GenSgrprojVtable(void) {
  for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
    const sgr_params_type *const params = &av1_sgr_params[i];
    for (int j = 0; j < 2; ++j) {
      const int e = params->e[j];
      const int r = params->r[j];
      if (r == 0) {                 // filter is disabled
        sgrproj_mtable[i][j] = -1;  // mark invalid
      } else {                      // filter is enabled
        // n is the number of pixels in the (2r + 1) x (2r + 1) window.
        const int n = (2 * r + 1) * (2 * r + 1);
        const int n2e = n * n * e;
        assert(n2e != 0);
        // Rounded division of 2^SGRPROJ_MTABLE_BITS by n^2 * e.
        sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
      }
    }
  }
}
#endif
130 
// One-time precalculation hook for loop restoration. Currently a no-op: the
// only work it would do (generating the sgrproj table) is compiled out.
void av1_loop_restoration_precal(void) {
#if 0
  GenSgrprojVtable();
#endif
}
136 
// Extends an 8-bit plane by replicating its edge pixels outwards.
//
// data:        pointer to the top-left pixel of the width x height plane. The
//              caller must provide writable storage for border_horz columns
//              on each side and border_vert rows above and below it.
// stride:      distance in pixels between the starts of consecutive rows.
// border_horz: number of columns to fill on the left and on the right.
// border_vert: number of rows to fill above and below.
static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
                               int border_horz, int border_vert) {
  // Left/right borders: repeat the first/last pixel of every row.
  for (int i = 0; i < height; ++i) {
    uint8_t *const row = data + i * stride;
    memset(row - border_horz, row[0], border_horz);
    memset(row + width, row[width - 1], border_horz);
  }

  // Top/bottom borders: repeat the first/last row, including the horizontal
  // borders filled in above.
  uint8_t *const data_p = data - border_horz;
  const int ext_width = width + 2 * border_horz;
  for (int i = -border_vert; i < 0; ++i) {
    memcpy(data_p + i * stride, data_p, ext_width);
  }
  for (int i = height; i < height + border_vert; ++i) {
    memcpy(data_p + i * stride, data_p + (height - 1) * stride, ext_width);
  }
}
155 
156 #if CONFIG_AV1_HIGHBITDEPTH
// Extends a 16-bit plane by replicating its edge pixels outwards.
//
// data:        pointer to the top-left pixel of the width x height plane. The
//              caller must provide writable storage for border_horz columns
//              on each side and border_vert rows above and below it.
// stride:      distance in pixels (not bytes) between consecutive rows.
// border_horz: number of columns to fill on the left and on the right.
// border_vert: number of rows to fill above and below.
static void extend_frame_highbd(uint16_t *data, int width, int height,
                                int stride, int border_horz, int border_vert) {
  // Left/right borders: repeat the first/last pixel of every row.
  for (int i = 0; i < height; ++i) {
    uint16_t *const row = data + i * stride;
    for (int j = -border_horz; j < 0; ++j) row[j] = row[0];
    for (int j = width; j < width + border_horz; ++j) row[j] = row[width - 1];
  }

  // Top/bottom borders: repeat the first/last row, including the horizontal
  // borders filled in above.
  uint16_t *const data_p = data - border_horz;
  const size_t ext_bytes = (width + 2 * border_horz) * sizeof(uint16_t);
  for (int i = -border_vert; i < 0; ++i) {
    memcpy(data_p + i * stride, data_p, ext_bytes);
  }
  for (int i = height; i < height + border_vert; ++i) {
    memcpy(data_p + i * stride, data_p + (height - 1) * stride, ext_bytes);
  }
}
176 
// Copies a width x height block of 16-bit pixels from src to dst, row by row.
// Strides are in pixels. The two blocks must not overlap.
static void copy_tile_highbd(int width, int height, const uint16_t *src,
                             int src_stride, uint16_t *dst, int dst_stride) {
  for (int i = 0; i < height; ++i) {
    memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
  }
}
182 #endif
183 
// Extends a plane by replicating its edge pixels into the surrounding border,
// dispatching on pixel depth.
//
// data:   plane pointer. When highbd is nonzero (and high bit depth support is
//         compiled in) it is converted with CONVERT_TO_SHORTPTR and the plane
//         is processed as 16-bit pixels; otherwise it is used directly as
//         8-bit pixels.
// border_horz / border_vert: border size in columns / rows.
void av1_extend_frame(uint8_t *data, int width, int height, int stride,
                      int border_horz, int border_vert, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
                        border_horz, border_vert);
    return;
  }
#endif
  // highbd is unused when high bit depth support is compiled out.
  (void)highbd;
  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
}
196 
// Copies a width x height block of 8-bit pixels from src to dst, row by row.
// The two blocks must not overlap.
static void copy_tile_lowbd(int width, int height, const uint8_t *src,
                            int src_stride, uint8_t *dst, int dst_stride) {
  for (int i = 0; i < height; ++i) {
    memcpy(dst + i * dst_stride, src + i * src_stride, width);
  }
}
202 
// Copies a width x height block of pixels from src to dst, dispatching on
// pixel depth. When highbd is nonzero (and high bit depth support is compiled
// in) both pointers are converted with CONVERT_TO_SHORTPTR and the block is
// copied as 16-bit pixels; otherwise it is copied as 8-bit pixels.
static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
                      uint8_t *dst, int dst_stride, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
                     CONVERT_TO_SHORTPTR(dst), dst_stride);
    return;
  }
#endif
  // highbd is unused when high bit depth support is compiled out.
  (void)highbd;
  copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
}
215 
// Yields a byte pointer suitable for memcpy: for high bit depth (hbd != 0)
// 'd' is first converted with CONVERT_TO_SHORTPTR; otherwise it is used as-is.
#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
217 
218 // With striped loop restoration, the filtering for each 64-pixel stripe gets
219 // most of its input from the output of CDEF (stored in data8), but we need to
220 // fill out a border of 3 pixels above/below the stripe according to the
221 // following
222 // rules:
223 //
224 // * At a frame boundary, we copy the outermost row of CDEF pixels three times.
225 //   This extension is done by a call to av1_extend_frame() at the start of the
226 //   loop restoration process, so the value of copy_above/copy_below doesn't
227 //   strictly matter. However, by setting *copy_above = *copy_below = 1 whenever
228 //   loop filtering across tiles is disabled, we can allow
229 //   {setup,restore}_processing_stripe_boundary to assume that the top/bottom
230 //   data has always been copied, simplifying the behaviour at the left and
231 //   right edges of tiles.
232 //
233 // * If we're at a tile boundary and loop filtering across tiles is enabled,
234 //   then there is a logical stripe which is 64 pixels high, but which is split
235 //   into an 8px high and a 56px high stripe so that the processing (and
236 //   coefficient set usage) can be aligned to tiles.
237 //   In this case, we use the 3 rows of CDEF output across the boundary for
238 //   context; this corresponds to leaving the frame buffer as-is.
239 //
240 // * If we're at a tile boundary and loop filtering across tiles is disabled,
241 //   then we take the outermost row of CDEF pixels *within the current tile*
242 //   and copy it three times. Thus we behave exactly as if the tile were a full
243 //   frame.
244 //
245 // * Otherwise, we're at a stripe boundary within a tile. In that case, we
246 //   take 2 rows of deblocked pixels and extend them to 3 rows of context.
247 //
248 // The distinction between the latter two cases is handled by the
249 // av1_loop_restoration_save_boundary_lines() function, so here we just need
250 // to decide if we're overwriting the above/below boundary pixels or not.
get_stripe_boundary_info(const RestorationTileLimits * limits,const AV1PixelRect * tile_rect,int ss_y,int * copy_above,int * copy_below)251 static void get_stripe_boundary_info(const RestorationTileLimits *limits,
252                                      const AV1PixelRect *tile_rect, int ss_y,
253                                      int *copy_above, int *copy_below) {
254   *copy_above = 1;
255   *copy_below = 1;
256 
257   const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
258   const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
259 
260   const int first_stripe_in_tile = (limits->v_start == tile_rect->top);
261   const int this_stripe_height =
262       full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
263   const int last_stripe_in_tile =
264       (limits->v_start + this_stripe_height >= tile_rect->bottom);
265 
266   if (first_stripe_in_tile) *copy_above = 0;
267   if (last_stripe_in_tile) *copy_below = 0;
268 }
269 
270 // Overwrite the border pixels around a processing stripe so that the conditions
271 // listed above get_stripe_boundary_info() are preserved.
272 // We save the pixels which get overwritten into a temporary buffer, so that
273 // they can be restored by restore_processing_stripe_boundary() after we've
274 // processed the stripe.
275 //
276 // limits gives the rectangular limits of the remaining stripes for the current
277 // restoration unit. rsb is the stored stripe boundaries (taken from either
278 // deblock or CDEF output as necessary).
279 //
280 // tile_rect is the limits of the current tile and tile_stripe0 is the index of
281 // the first stripe in this tile (needed to convert the tile-relative stripe
282 // index we get from limits into something we can look up in rsb).
setup_processing_stripe_boundary(const RestorationTileLimits * limits,const RestorationStripeBoundaries * rsb,int rsb_row,int use_highbd,int h,uint8_t * data8,int data_stride,RestorationLineBuffers * rlbs,int copy_above,int copy_below,int opt)283 static void setup_processing_stripe_boundary(
284     const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
285     int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
286     RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
287   // Offsets within the line buffers. The buffer logically starts at column
288   // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
289   // has column x0 in the buffer.
290   const int buf_stride = rsb->stripe_boundary_stride;
291   const int buf_x0_off = limits->h_start;
292   const int line_width =
293       (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
294   const int line_size = line_width << use_highbd;
295 
296   const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
297 
298   // Replace RESTORATION_BORDER pixels above the top of the stripe
299   // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
300   // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
301   // duplicating the topmost of the 2 lines (see the AOMMAX call when
302   // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
303   //
304   // Special case: If we're at the top of a tile, which isn't on the topmost
305   // tile row, and we're allowed to loop filter across tiles, then we have a
306   // logical 64-pixel-high stripe which has been split into an 8-pixel high
307   // stripe and a 56-pixel high stripe (the current one). So, in this case,
308   // we want to leave the boundary alone!
309   if (!opt) {
310     if (copy_above) {
311       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
312 
313       for (int i = -RESTORATION_BORDER; i < 0; ++i) {
314         const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
315         const int buf_off = buf_x0_off + buf_row * buf_stride;
316         const uint8_t *buf =
317             rsb->stripe_boundary_above + (buf_off << use_highbd);
318         uint8_t *dst8 = data8_tl + i * data_stride;
319         // Save old pixels, then replace with data from stripe_boundary_above
320         memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
321                REAL_PTR(use_highbd, dst8), line_size);
322         memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
323       }
324     }
325 
326     // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
327     // The second buffer row is repeated, so src_row gets the values 0, 1, 1
328     // for i = 0, 1, 2.
329     if (copy_below) {
330       const int stripe_end = limits->v_start + h;
331       uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
332 
333       for (int i = 0; i < RESTORATION_BORDER; ++i) {
334         const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
335         const int buf_off = buf_x0_off + buf_row * buf_stride;
336         const uint8_t *src =
337             rsb->stripe_boundary_below + (buf_off << use_highbd);
338 
339         uint8_t *dst8 = data8_bl + i * data_stride;
340         // Save old pixels, then replace with data from stripe_boundary_below
341         memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
342         memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
343       }
344     }
345   } else {
346     if (copy_above) {
347       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
348 
349       // Only save and overwrite i=-RESTORATION_BORDER line.
350       uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
351       // Save old pixels, then replace with data from stripe_boundary_above
352       memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
353       memcpy(REAL_PTR(use_highbd, dst8),
354              REAL_PTR(use_highbd,
355                       data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
356              line_size);
357     }
358 
359     if (copy_below) {
360       const int stripe_end = limits->v_start + h;
361       uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
362 
363       // Only save and overwrite i=2 line.
364       uint8_t *dst8 = data8_bl + 2 * data_stride;
365       // Save old pixels, then replace with data from stripe_boundary_below
366       memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
367       memcpy(REAL_PTR(use_highbd, dst8),
368              REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
369     }
370   }
371 }
372 
373 // This function restores the boundary lines modified by
374 // setup_processing_stripe_boundary.
375 //
376 // Note: We need to be careful when handling the corners of the processing
377 // unit, because (eg.) the top-left corner is considered to be part of
378 // both the left and top borders. This means that, depending on the
379 // loop_filter_across_tiles_enabled flag, the corner pixels might get
380 // overwritten twice, once as part of the "top" border and once as part
381 // of the "left" border (or similar for other corners).
382 //
383 // Everything works out fine as long as we make sure to reverse the order
384 // when restoring, ie. we need to restore the left/right borders followed
385 // by the top/bottom borders.
restore_processing_stripe_boundary(const RestorationTileLimits * limits,const RestorationLineBuffers * rlbs,int use_highbd,int h,uint8_t * data8,int data_stride,int copy_above,int copy_below,int opt)386 static void restore_processing_stripe_boundary(
387     const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
388     int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
389     int copy_below, int opt) {
390   const int line_width =
391       (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
392   const int line_size = line_width << use_highbd;
393 
394   const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
395 
396   if (!opt) {
397     if (copy_above) {
398       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
399       for (int i = -RESTORATION_BORDER; i < 0; ++i) {
400         uint8_t *dst8 = data8_tl + i * data_stride;
401         memcpy(REAL_PTR(use_highbd, dst8),
402                rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
403       }
404     }
405 
406     if (copy_below) {
407       const int stripe_bottom = limits->v_start + h;
408       uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
409 
410       for (int i = 0; i < RESTORATION_BORDER; ++i) {
411         if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
412 
413         uint8_t *dst8 = data8_bl + i * data_stride;
414         memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
415       }
416     }
417   } else {
418     if (copy_above) {
419       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
420 
421       // Only restore i=-RESTORATION_BORDER line.
422       uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
423       memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
424     }
425 
426     if (copy_below) {
427       const int stripe_bottom = limits->v_start + h;
428       uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
429 
430       // Only restore i=2 line.
431       if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
432         uint8_t *dst8 = data8_bl + 2 * data_stride;
433         memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
434       }
435     }
436   }
437 }
438 
wiener_filter_stripe(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int32_t * tmpbuf,int bit_depth)439 static void wiener_filter_stripe(const RestorationUnitInfo *rui,
440                                  int stripe_width, int stripe_height,
441                                  int procunit_width, const uint8_t *src,
442                                  int src_stride, uint8_t *dst, int dst_stride,
443                                  int32_t *tmpbuf, int bit_depth) {
444   (void)tmpbuf;
445   (void)bit_depth;
446   assert(bit_depth == 8);
447   const ConvolveParams conv_params = get_conv_params_wiener(8);
448 
449   for (int j = 0; j < stripe_width; j += procunit_width) {
450     int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
451     const uint8_t *src_p = src + j;
452     uint8_t *dst_p = dst + j;
453     av1_wiener_convolve_add_src(
454         src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
455         rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
456   }
457 }
458 
459 /* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
460    over the input. The window is of size (2r + 1)x(2r + 1), and we
461    specialize to r = 1, 2, 3. A default function is used for r > 3.
462 
463    Each loop follows the same format: We keep a window's worth of input
464    in individual variables and select data out of that as appropriate.
465 */
boxsum1(int32_t * src,int width,int height,int src_stride,int sqr,int32_t * dst,int dst_stride)466 static void boxsum1(int32_t *src, int width, int height, int src_stride,
467                     int sqr, int32_t *dst, int dst_stride) {
468   int i, j, a, b, c;
469   assert(width > 2 * SGRPROJ_BORDER_HORZ);
470   assert(height > 2 * SGRPROJ_BORDER_VERT);
471 
472   // Vertical sum over 3-pixel regions, from src into dst.
473   if (!sqr) {
474     for (j = 0; j < width; ++j) {
475       a = src[j];
476       b = src[src_stride + j];
477       c = src[2 * src_stride + j];
478 
479       dst[j] = a + b;
480       for (i = 1; i < height - 2; ++i) {
481         // Loop invariant: At the start of each iteration,
482         // a = src[(i - 1) * src_stride + j]
483         // b = src[(i    ) * src_stride + j]
484         // c = src[(i + 1) * src_stride + j]
485         dst[i * dst_stride + j] = a + b + c;
486         a = b;
487         b = c;
488         c = src[(i + 2) * src_stride + j];
489       }
490       dst[i * dst_stride + j] = a + b + c;
491       dst[(i + 1) * dst_stride + j] = b + c;
492     }
493   } else {
494     for (j = 0; j < width; ++j) {
495       a = src[j] * src[j];
496       b = src[src_stride + j] * src[src_stride + j];
497       c = src[2 * src_stride + j] * src[2 * src_stride + j];
498 
499       dst[j] = a + b;
500       for (i = 1; i < height - 2; ++i) {
501         dst[i * dst_stride + j] = a + b + c;
502         a = b;
503         b = c;
504         c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
505       }
506       dst[i * dst_stride + j] = a + b + c;
507       dst[(i + 1) * dst_stride + j] = b + c;
508     }
509   }
510 
511   // Horizontal sum over 3-pixel regions of dst
512   for (i = 0; i < height; ++i) {
513     a = dst[i * dst_stride];
514     b = dst[i * dst_stride + 1];
515     c = dst[i * dst_stride + 2];
516 
517     dst[i * dst_stride] = a + b;
518     for (j = 1; j < width - 2; ++j) {
519       // Loop invariant: At the start of each iteration,
520       // a = src[i * src_stride + (j - 1)]
521       // b = src[i * src_stride + (j    )]
522       // c = src[i * src_stride + (j + 1)]
523       dst[i * dst_stride + j] = a + b + c;
524       a = b;
525       b = c;
526       c = dst[i * dst_stride + (j + 2)];
527     }
528     dst[i * dst_stride + j] = a + b + c;
529     dst[i * dst_stride + (j + 1)] = b + c;
530   }
531 }
532 
boxsum2(int32_t * src,int width,int height,int src_stride,int sqr,int32_t * dst,int dst_stride)533 static void boxsum2(int32_t *src, int width, int height, int src_stride,
534                     int sqr, int32_t *dst, int dst_stride) {
535   int i, j, a, b, c, d, e;
536   assert(width > 2 * SGRPROJ_BORDER_HORZ);
537   assert(height > 2 * SGRPROJ_BORDER_VERT);
538 
539   // Vertical sum over 5-pixel regions, from src into dst.
540   if (!sqr) {
541     for (j = 0; j < width; ++j) {
542       a = src[j];
543       b = src[src_stride + j];
544       c = src[2 * src_stride + j];
545       d = src[3 * src_stride + j];
546       e = src[4 * src_stride + j];
547 
548       dst[j] = a + b + c;
549       dst[dst_stride + j] = a + b + c + d;
550       for (i = 2; i < height - 3; ++i) {
551         // Loop invariant: At the start of each iteration,
552         // a = src[(i - 2) * src_stride + j]
553         // b = src[(i - 1) * src_stride + j]
554         // c = src[(i    ) * src_stride + j]
555         // d = src[(i + 1) * src_stride + j]
556         // e = src[(i + 2) * src_stride + j]
557         dst[i * dst_stride + j] = a + b + c + d + e;
558         a = b;
559         b = c;
560         c = d;
561         d = e;
562         e = src[(i + 3) * src_stride + j];
563       }
564       dst[i * dst_stride + j] = a + b + c + d + e;
565       dst[(i + 1) * dst_stride + j] = b + c + d + e;
566       dst[(i + 2) * dst_stride + j] = c + d + e;
567     }
568   } else {
569     for (j = 0; j < width; ++j) {
570       a = src[j] * src[j];
571       b = src[src_stride + j] * src[src_stride + j];
572       c = src[2 * src_stride + j] * src[2 * src_stride + j];
573       d = src[3 * src_stride + j] * src[3 * src_stride + j];
574       e = src[4 * src_stride + j] * src[4 * src_stride + j];
575 
576       dst[j] = a + b + c;
577       dst[dst_stride + j] = a + b + c + d;
578       for (i = 2; i < height - 3; ++i) {
579         dst[i * dst_stride + j] = a + b + c + d + e;
580         a = b;
581         b = c;
582         c = d;
583         d = e;
584         e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
585       }
586       dst[i * dst_stride + j] = a + b + c + d + e;
587       dst[(i + 1) * dst_stride + j] = b + c + d + e;
588       dst[(i + 2) * dst_stride + j] = c + d + e;
589     }
590   }
591 
592   // Horizontal sum over 5-pixel regions of dst
593   for (i = 0; i < height; ++i) {
594     a = dst[i * dst_stride];
595     b = dst[i * dst_stride + 1];
596     c = dst[i * dst_stride + 2];
597     d = dst[i * dst_stride + 3];
598     e = dst[i * dst_stride + 4];
599 
600     dst[i * dst_stride] = a + b + c;
601     dst[i * dst_stride + 1] = a + b + c + d;
602     for (j = 2; j < width - 3; ++j) {
603       // Loop invariant: At the start of each iteration,
604       // a = src[i * src_stride + (j - 2)]
605       // b = src[i * src_stride + (j - 1)]
606       // c = src[i * src_stride + (j    )]
607       // d = src[i * src_stride + (j + 1)]
608       // e = src[i * src_stride + (j + 2)]
609       dst[i * dst_stride + j] = a + b + c + d + e;
610       a = b;
611       b = c;
612       c = d;
613       d = e;
614       e = dst[i * dst_stride + (j + 3)];
615     }
616     dst[i * dst_stride + j] = a + b + c + d + e;
617     dst[i * dst_stride + (j + 1)] = b + c + d + e;
618     dst[i * dst_stride + (j + 2)] = c + d + e;
619   }
620 }
621 
// Dispatches to the box-sum implementation for radius r. Only r == 1 (3x3
// window) and r == 2 (5x5 window) are supported; any other value trips an
// assert in debug builds and leaves dst untouched otherwise.
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  if (r == 1) {
    boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
  } else if (r == 2) {
    boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
  } else {
    assert(0 && "Invalid value of r in self-guided filter");
  }
}
631 
av1_decode_xq(const int * xqd,int * xq,const sgr_params_type * params)632 void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
633   if (params->r[0] == 0) {
634     xq[0] = 0;
635     xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
636   } else if (params->r[1] == 0) {
637     xq[0] = xqd[0];
638     xq[1] = 0;
639   } else {
640     xq[0] = xqd[0];
641     xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
642   }
643 }
644 
// Lookup table for x / (x + 1) scaled by 256 and rounded to nearest, indexed
// by x in [0, 255].
const int32_t av1_x_by_xplus1[256] = {
  // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
  // instead of 0. See comments in selfguided_restoration_internal() for why
  1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  256,
};
667 
668 const int32_t av1_one_by_x[MAX_NELEM] = {
669   4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
670   293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
671 };
672 
calculate_intermediate_result(int32_t * dgd,int width,int height,int dgd_stride,int bit_depth,int sgr_params_idx,int radius_idx,int pass,int32_t * A,int32_t * B)673 static void calculate_intermediate_result(int32_t *dgd, int width, int height,
674                                           int dgd_stride, int bit_depth,
675                                           int sgr_params_idx, int radius_idx,
676                                           int pass, int32_t *A, int32_t *B) {
677   const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
678   const int r = params->r[radius_idx];
679   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
680   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
681   // Adjusting the stride of A and B here appears to avoid bad cache effects,
682   // leading to a significant speed improvement.
683   // We also align the stride to a multiple of 16 bytes, for consistency
684   // with the SIMD version of this function.
685   int buf_stride = ((width_ext + 3) & ~3) + 16;
686   const int step = pass == 0 ? 1 : 2;
687   int i, j;
688 
689   assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
690   assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
691          "Need SGRPROJ_BORDER_* >= r+1");
692 
693   boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
694          width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
695   boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
696          width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
697   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
698   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
699   // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
700   // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
701   for (i = -1; i < height + 1; i += step) {
702     for (j = -1; j < width + 1; ++j) {
703       const int k = i * buf_stride + j;
704       const int n = (2 * r + 1) * (2 * r + 1);
705 
706       // a < 2^16 * n < 2^22 regardless of bit depth
707       uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
708       // b < 2^8 * n < 2^14 regardless of bit depth
709       uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
710 
711       // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
712       // and p itself satisfies p < 2^14 * n^2 < 2^26.
713       // This bound on p is due to:
714       // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
715       //
716       // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
717       // This is an artefact of rounding, and can only happen if all pixels
718       // are (almost) identical, so in this case we saturate to p=0.
719       uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
720 
721       const uint32_t s = params->s[radius_idx];
722 
723       // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
724       // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
725       // (this holds even after accounting for the rounding in s)
726       const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
727 
728       // Note: We have to be quite careful about the value of A[k].
729       // This is used as a blend factor between individual pixel values and the
730       // local mean. So it logically has a range of [0, 256], including both
731       // endpoints.
732       //
733       // This is a pain for hardware, as we'd like something which can be stored
734       // in exactly 8 bits.
735       // Further, in the calculation of B[k] below, if z == 0 and r == 2,
736       // then A[k] "should be" 0. But then we can end up setting B[k] to a value
737       // slightly above 2^(8 + bit depth), due to rounding in the value of
738       // av1_one_by_x[25-1].
739       //
740       // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
741       // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
742       // overflow), without significantly affecting the final result: z == 0
743       // implies that the image is essentially "flat", so the local mean and
744       // individual pixel values are very similar.
745       //
746       // Note that saturating on the other side, ie. requring A[k] <= 255,
747       // would be a bad idea, as that corresponds to the case where the image
748       // is very variable, when we want to preserve the local pixel value as
749       // much as possible.
750       A[k] = av1_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
751 
752       // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
753       // av1_one_by_x[n - 1] = round(2^12 / n)
754       // => the product here is < 2^(20 + bit_depth) <= 2^32,
755       // and B[k] is set to a value < 2^(8 + bit depth)
756       // This holds even with the rounding in av1_one_by_x and in the overall
757       // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
758       B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
759                                              (uint32_t)B[k] *
760                                              (uint32_t)av1_one_by_x[n - 1],
761                                          SGRPROJ_RECIP_BITS);
762     }
763   }
764 }
765 
selfguided_restoration_fast_internal(int32_t * dgd,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int sgr_params_idx,int radius_idx)766 static void selfguided_restoration_fast_internal(
767     int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
768     int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
769   const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
770   const int r = params->r[radius_idx];
771   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
772   // Adjusting the stride of A and B here appears to avoid bad cache effects,
773   // leading to a significant speed improvement.
774   // We also align the stride to a multiple of 16 bytes, for consistency
775   // with the SIMD version of this function.
776   int buf_stride = ((width_ext + 3) & ~3) + 16;
777   int32_t A_[RESTORATION_PROC_UNIT_PELS];
778   int32_t B_[RESTORATION_PROC_UNIT_PELS];
779   int32_t *A = A_;
780   int32_t *B = B_;
781   int i, j;
782   calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
783                                 sgr_params_idx, radius_idx, 1, A, B);
784   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
785   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
786 
787   // Use the A[] and B[] arrays to calculate the filtered image
788   (void)r;
789   assert(r == 2);
790   for (i = 0; i < height; ++i) {
791     if (!(i & 1)) {  // even row
792       for (j = 0; j < width; ++j) {
793         const int k = i * buf_stride + j;
794         const int l = i * dgd_stride + j;
795         const int m = i * dst_stride + j;
796         const int nb = 5;
797         const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
798                           (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
799                            A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
800                               5;
801         const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
802                           (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
803                            B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
804                               5;
805         const int32_t v = a * dgd[l] + b;
806         dst[m] =
807             ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
808       }
809     } else {  // odd row
810       for (j = 0; j < width; ++j) {
811         const int k = i * buf_stride + j;
812         const int l = i * dgd_stride + j;
813         const int m = i * dst_stride + j;
814         const int nb = 4;
815         const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
816         const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
817         const int32_t v = a * dgd[l] + b;
818         dst[m] =
819             ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
820       }
821     }
822   }
823 }
824 
selfguided_restoration_internal(int32_t * dgd,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int sgr_params_idx,int radius_idx)825 static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
826                                             int dgd_stride, int32_t *dst,
827                                             int dst_stride, int bit_depth,
828                                             int sgr_params_idx,
829                                             int radius_idx) {
830   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
831   // Adjusting the stride of A and B here appears to avoid bad cache effects,
832   // leading to a significant speed improvement.
833   // We also align the stride to a multiple of 16 bytes, for consistency
834   // with the SIMD version of this function.
835   int buf_stride = ((width_ext + 3) & ~3) + 16;
836   int32_t A_[RESTORATION_PROC_UNIT_PELS];
837   int32_t B_[RESTORATION_PROC_UNIT_PELS];
838   int32_t *A = A_;
839   int32_t *B = B_;
840   int i, j;
841   calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
842                                 sgr_params_idx, radius_idx, 0, A, B);
843   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
844   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
845 
846   // Use the A[] and B[] arrays to calculate the filtered image
847   for (i = 0; i < height; ++i) {
848     for (j = 0; j < width; ++j) {
849       const int k = i * buf_stride + j;
850       const int l = i * dgd_stride + j;
851       const int m = i * dst_stride + j;
852       const int nb = 5;
853       const int32_t a =
854           (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
855               4 +
856           (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
857            A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
858               3;
859       const int32_t b =
860           (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
861               4 +
862           (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
863            B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
864               3;
865       const int32_t v = a * dgd[l] + b;
866       dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
867     }
868   }
869 }
870 
av1_selfguided_restoration_c(const uint8_t * dgd8,int width,int height,int dgd_stride,int32_t * flt0,int32_t * flt1,int flt_stride,int sgr_params_idx,int bit_depth,int highbd)871 int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
872                                  int dgd_stride, int32_t *flt0, int32_t *flt1,
873                                  int flt_stride, int sgr_params_idx,
874                                  int bit_depth, int highbd) {
875   int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
876   const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
877   int32_t *dgd32 =
878       dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
879 
880   if (highbd) {
881     const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
882     for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
883       for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
884         dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
885       }
886     }
887   } else {
888     for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
889       for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
890         dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
891       }
892     }
893   }
894 
895   const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
896   // If params->r == 0 we skip the corresponding filter. We only allow one of
897   // the radii to be 0, as having both equal to 0 would be equivalent to
898   // skipping SGR entirely.
899   assert(!(params->r[0] == 0 && params->r[1] == 0));
900 
901   if (params->r[0] > 0)
902     selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
903                                          flt0, flt_stride, bit_depth,
904                                          sgr_params_idx, 0);
905   if (params->r[1] > 0)
906     selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
907                                     flt_stride, bit_depth, sgr_params_idx, 1);
908   return 0;
909 }
910 
av1_apply_selfguided_restoration_c(const uint8_t * dat8,int width,int height,int stride,int eps,const int * xqd,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth,int highbd)911 void av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
912                                         int height, int stride, int eps,
913                                         const int *xqd, uint8_t *dst8,
914                                         int dst_stride, int32_t *tmpbuf,
915                                         int bit_depth, int highbd) {
916   int32_t *flt0 = tmpbuf;
917   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
918   assert(width * height <= RESTORATION_UNITPELS_MAX);
919 
920   const int ret = av1_selfguided_restoration_c(
921       dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
922   (void)ret;
923   assert(!ret);
924   const sgr_params_type *const params = &av1_sgr_params[eps];
925   int xq[2];
926   av1_decode_xq(xqd, xq, params);
927   for (int i = 0; i < height; ++i) {
928     for (int j = 0; j < width; ++j) {
929       const int k = i * width + j;
930       uint8_t *dst8ij = dst8 + i * dst_stride + j;
931       const uint8_t *dat8ij = dat8 + i * stride + j;
932 
933       const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
934       const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
935       int32_t v = u << SGRPROJ_PRJ_BITS;
936       // If params->r == 0 then we skipped the filtering in
937       // av1_selfguided_restoration_c, i.e. flt[k] == u
938       if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
939       if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
940       const int16_t w =
941           (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
942 
943       const uint16_t out = clip_pixel_highbd(w, bit_depth);
944       if (highbd)
945         *CONVERT_TO_SHORTPTR(dst8ij) = out;
946       else
947         *dst8ij = (uint8_t)out;
948     }
949   }
950 }
951 
sgrproj_filter_stripe(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int32_t * tmpbuf,int bit_depth)952 static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
953                                   int stripe_width, int stripe_height,
954                                   int procunit_width, const uint8_t *src,
955                                   int src_stride, uint8_t *dst, int dst_stride,
956                                   int32_t *tmpbuf, int bit_depth) {
957   (void)bit_depth;
958   assert(bit_depth == 8);
959 
960   for (int j = 0; j < stripe_width; j += procunit_width) {
961     int w = AOMMIN(procunit_width, stripe_width - j);
962     av1_apply_selfguided_restoration(
963         src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
964         rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, 0);
965   }
966 }
967 
968 #if CONFIG_AV1_HIGHBITDEPTH
wiener_filter_stripe_highbd(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth)969 static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
970                                         int stripe_width, int stripe_height,
971                                         int procunit_width, const uint8_t *src8,
972                                         int src_stride, uint8_t *dst8,
973                                         int dst_stride, int32_t *tmpbuf,
974                                         int bit_depth) {
975   (void)tmpbuf;
976   const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
977 
978   for (int j = 0; j < stripe_width; j += procunit_width) {
979     int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
980     const uint8_t *src8_p = src8 + j;
981     uint8_t *dst8_p = dst8 + j;
982     av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
983                                        rui->wiener_info.hfilter, 16,
984                                        rui->wiener_info.vfilter, 16, w,
985                                        stripe_height, &conv_params, bit_depth);
986   }
987 }
988 
sgrproj_filter_stripe_highbd(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth)989 static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
990                                          int stripe_width, int stripe_height,
991                                          int procunit_width,
992                                          const uint8_t *src8, int src_stride,
993                                          uint8_t *dst8, int dst_stride,
994                                          int32_t *tmpbuf, int bit_depth) {
995   for (int j = 0; j < stripe_width; j += procunit_width) {
996     int w = AOMMIN(procunit_width, stripe_width - j);
997     av1_apply_selfguided_restoration(
998         src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
999         rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
1000   }
1001 }
1002 #endif  // CONFIG_AV1_HIGHBITDEPTH
1003 
1004 typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
1005                                   int stripe_width, int stripe_height,
1006                                   int procunit_width, const uint8_t *src,
1007                                   int src_stride, uint8_t *dst, int dst_stride,
1008                                   int32_t *tmpbuf, int bit_depth);
1009 
1010 #if CONFIG_AV1_HIGHBITDEPTH
1011 #define NUM_STRIPE_FILTERS 4
1012 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
1013   wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
1014   sgrproj_filter_stripe_highbd
1015 };
1016 #else
1017 #define NUM_STRIPE_FILTERS 2
1018 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
1019   wiener_filter_stripe, sgrproj_filter_stripe
1020 };
1021 #endif  // CONFIG_AV1_HIGHBITDEPTH
1022 
1023 // Filter one restoration unit
av1_loop_restoration_filter_unit(const RestorationTileLimits * limits,const RestorationUnitInfo * rui,const RestorationStripeBoundaries * rsb,RestorationLineBuffers * rlbs,const AV1PixelRect * tile_rect,int tile_stripe0,int ss_x,int ss_y,int highbd,int bit_depth,uint8_t * data8,int stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int optimized_lr)1024 void av1_loop_restoration_filter_unit(
1025     const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
1026     const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
1027     const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
1028     int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
1029     int dst_stride, int32_t *tmpbuf, int optimized_lr) {
1030   RestorationType unit_rtype = rui->restoration_type;
1031 
1032   int unit_h = limits->v_end - limits->v_start;
1033   int unit_w = limits->h_end - limits->h_start;
1034   uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;
1035   uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
1036 
1037   if (unit_rtype == RESTORE_NONE) {
1038     copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
1039     return;
1040   }
1041 
1042   const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1043   assert(filter_idx < NUM_STRIPE_FILTERS);
1044   const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1045 
1046   const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1047 
1048   // Convolve the whole tile one stripe at a time
1049   RestorationTileLimits remaining_stripes = *limits;
1050   int i = 0;
1051   while (i < unit_h) {
1052     int copy_above, copy_below;
1053     remaining_stripes.v_start = limits->v_start + i;
1054 
1055     get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above,
1056                              &copy_below);
1057 
1058     const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1059     const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1060 
1061     // Work out where this stripe's boundaries are within
1062     // rsb->stripe_boundary_{above,below}
1063     const int tile_stripe =
1064         (remaining_stripes.v_start - tile_rect->top + runit_offset) /
1065         full_stripe_height;
1066     const int frame_stripe = tile_stripe0 + tile_stripe;
1067     const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1068 
1069     // Calculate this stripe's height, based on two rules:
1070     // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
1071     // * We can't extend past the end of the current restoration unit
1072     const int nominal_stripe_height =
1073         full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
1074     const int h = AOMMIN(nominal_stripe_height,
1075                          remaining_stripes.v_end - remaining_stripes.v_start);
1076 
1077     setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1078                                      h, data8, stride, rlbs, copy_above,
1079                                      copy_below, optimized_lr);
1080 
1081     stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1082                   dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
1083 
1084     restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1085                                        data8, stride, copy_above, copy_below,
1086                                        optimized_lr);
1087 
1088     i += h;
1089   }
1090 }
1091 
// Restoration-unit visitor used for whole-frame filtering.
//
// `priv` is the FilterFrameCtxt of the plane being processed; the function
// unpacks it and forwards everything to av1_loop_restoration_filter_unit()
// for the unit with index rest_unit_idx.
static void filter_frame_on_unit(const RestorationTileLimits *limits,
                                 const AV1PixelRect *tile_rect,
                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
                                 RestorationLineBuffers *rlbs) {
  const FilterFrameCtxt *const plane_ctxt = (FilterFrameCtxt *)priv;
  const RestorationInfo *const rsi = plane_ctxt->rsi;
  const RestorationUnitInfo *const unit_info = &rsi->unit_info[rest_unit_idx];

  av1_loop_restoration_filter_unit(
      limits, unit_info, &rsi->boundaries, rlbs, tile_rect,
      plane_ctxt->tile_stripe0, plane_ctxt->ss_x, plane_ctxt->ss_y,
      plane_ctxt->highbd, plane_ctxt->bit_depth, plane_ctxt->data8,
      plane_ctxt->data_stride, plane_ctxt->dst8, plane_ctxt->dst_stride,
      tmpbuf, rsi->optimized_lr);
}
1105 
// Prepares lr_ctxt for filtering `frame`.
//
// (Re)allocates cm->rst_frame as the destination buffer, reporting
// AOM_CODEC_MEM_ERROR through aom_internal_error() on failure, installs
// filter_frame_on_unit as the unit visitor, and then, for every plane whose
// frame restoration type is not RESTORE_NONE, extends the source plane by
// RESTORATION_BORDER and fills in that plane's FilterFrameCtxt.
// rsi->optimized_lr is recorded for every plane, including skipped ones.
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
                                            YV12_BUFFER_CONFIG *frame,
                                            AV1_COMMON *cm, int optimized_lr,
                                            int num_planes) {
  const SequenceHeader *const seq_params = &cm->seq_params;
  const int bit_depth = seq_params->bit_depth;
  const int highbd = seq_params->use_highbitdepth;
  lr_ctxt->dst = &cm->rst_frame;

  const int frame_width = frame->crop_widths[0];
  const int frame_height = frame->crop_heights[0];
  const int alloc_status = aom_realloc_frame_buffer(
      lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
      seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
      cm->features.byte_alignment, NULL, NULL, NULL);
  if (alloc_status < 0) {
    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                       "Failed to allocate restoration dst buffer");
  }

  lr_ctxt->on_rest_unit = filter_frame_on_unit;
  lr_ctxt->frame = frame;

  for (int plane = 0; plane < num_planes; ++plane) {
    RestorationInfo *const rsi = &cm->rst_info[plane];
    rsi->optimized_lr = optimized_lr;

    // Planes without restoration need no border extension or context.
    if (rsi->frame_restoration_type == RESTORE_NONE) continue;

    const int is_uv = plane > 0;
    const int plane_stride = frame->strides[is_uv];

    av1_extend_frame(frame->buffers[plane], frame->crop_widths[is_uv],
                     frame->crop_heights[is_uv], plane_stride,
                     RESTORATION_BORDER, RESTORATION_BORDER, highbd);

    FilterFrameCtxt *const plane_ctxt = &lr_ctxt->ctxt[plane];
    plane_ctxt->rsi = rsi;
    // Subsampling only applies to the chroma planes.
    plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
    plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
    plane_ctxt->highbd = highbd;
    plane_ctxt->bit_depth = bit_depth;
    plane_ctxt->data8 = frame->buffers[plane];
    plane_ctxt->data_stride = plane_stride;
    plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
    plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
    plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv);
    plane_ctxt->tile_stripe0 = 0;
  }
}
1157 
// Copies the restored planes from loop_rest_ctxt->dst back into
// loop_rest_ctxt->frame.
//
// Only planes whose frame restoration type is not RESTORE_NONE are copied,
// and only the area covered by that plane's tile_rect. At most three planes
// are supported.
void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
                                      AV1_COMMON *cm, int num_planes) {
  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
                           int vstart, int vend);
  // One copy routine per plane, indexed by plane number (Y, U, V).
  static const copy_fun copy_funs[3] = {
    aom_yv12_partial_coloc_copy_y,
    aom_yv12_partial_coloc_copy_u,
    aom_yv12_partial_coloc_copy_v,
  };
  assert(num_planes <= 3);

  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      const AV1PixelRect rect = loop_rest_ctxt->ctxt[plane].tile_rect;
      copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, rect.left,
                       rect.right, rect.top, rect.bottom);
    }
  }
}
1174 
// Runs lr_ctxt->on_rest_unit over every restoration unit of every plane whose
// frame restoration type is not RESTORE_NONE, using the plane's
// FilterFrameCtxt as the visitor's private data and cm->rst_tmpbuf / cm->rlbs
// as scratch buffers.
static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
                                        int num_planes) {
  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      FilterFrameCtxt *const plane_ctxt = &lr_ctxt->ctxt[plane];
      av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
                                     plane_ctxt, &plane_ctxt->tile_rect,
                                     cm->rst_tmpbuf, cm->rlbs);
    }
  }
}
1189 
// Applies loop restoration to a whole frame in three steps: initialise the
// restoration context, filter every restoration unit of every plane into the
// context's destination buffer, then copy the filtered planes back into
// `frame`. lr_ctxt must point to an AV1LrStruct. Must not be called when the
// frame is entirely lossless.
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
                                       AV1_COMMON *cm, int optimized_lr,
                                       void *lr_ctxt) {
  assert(!cm->features.all_lossless);

  AV1LrStruct *const loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
  const int num_planes = av1_num_planes(cm);

  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
                                         optimized_lr, num_planes);
  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
  av1_loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
}
1205 
// Visits every restoration unit in one row of units of a tile, left to right.
//
// Units are unit_size wide, except that the last unit absorbs the remainder
// whenever fewer than 1.5 * unit_size columns are left. For each unit the
// horizontal range in `limits` is updated (the vertical range must already
// be set by the caller), the read-sync callback(s) are invoked, on_rest_unit
// is called with the unit's index, and finally the write-sync callback is
// invoked.
void av1_foreach_rest_unit_in_row(
    RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
    int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
    void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
    sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
    struct AV1LrSyncData *const lr_sync) {
  const int tile_w = tile_rect->right - tile_rect->left;
  const int ext_size = unit_size * 3 / 2;
  // Index of the first unit in this row.
  const int row_idx0 = unit_idx0 + row_number * hunits_per_tile;
  const int has_row_below = (row_number + 1) < vunits_per_tile;

  for (int x0 = 0, j = 0; x0 < tile_w; ++j) {
    const int remaining_w = tile_w - x0;
    const int w = (remaining_w < ext_size) ? remaining_w : unit_size;

    limits->h_start = tile_rect->left + x0;
    limits->h_end = limits->h_start + w;
    assert(limits->h_end <= tile_rect->right);

    // No sync for even numbered rows.
    // For odd numbered rows, loop restoration of the current block requires
    // the LR of the top-right and bottom-right blocks to be completed.

    // top-right sync
    on_sync_read(lr_sync, row_number, j, plane);
    if (has_row_below) {
      // bottom-right sync
      on_sync_read(lr_sync, row_number + 2, j, plane);
    }

    on_rest_unit(limits, tile_rect, row_idx0 + j, priv, tmpbuf, rlbs);

    on_sync_write(lr_sync, row_number, j, hunits_per_tile, plane);

    x0 += w;
  }
}
1244 
// Read-sync callback that does nothing. foreach_rest_unit_in_tile() passes it
// to av1_foreach_rest_unit_in_row() together with a NULL lr_sync.
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
  // Touch every parameter so that none is reported as unused.
  (void)plane;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1251 
// Write-sync callback that does nothing. foreach_rest_unit_in_tile() passes
// it to av1_foreach_rest_unit_in_row() together with a NULL lr_sync.
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                             const int sb_cols, int plane) {
  // Touch every parameter so that none is reported as unused.
  (void)plane;
  (void)sb_cols;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1260 
// Visits every restoration unit of a tile, one row of units at a time.
//
// Rows are unit_size tall, except that the last row absorbs the remainder
// whenever fewer than 1.5 * unit_size rows are left. Each row's vertical
// range is shifted up by RESTORATION_UNIT_OFFSET >> ss_y (clamped to the tile
// top; the bottom edge of the last row is left unshifted) before the row is
// handed to av1_foreach_rest_unit_in_row() with the no-op sync callbacks.
static void foreach_rest_unit_in_tile(
    const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
    int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
    int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
    int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
  const int tile_h = tile_rect->bottom - tile_rect->top;
  const int ext_size = unit_size * 3 / 2;
  const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;

  // Index of the tile's first restoration unit.
  const int tile_idx = tile_col + tile_row * tile_cols;
  const int unit_idx0 = tile_idx * units_per_tile;

  for (int y0 = 0, i = 0; y0 < tile_h; ++i) {
    const int remaining_h = tile_h - y0;
    const int h = (remaining_h < ext_size) ? remaining_h : unit_size;

    const int unit_top = tile_rect->top + y0;
    const int unit_bottom = unit_top + h;
    assert(unit_bottom <= tile_rect->bottom);

    // Offset the tile upwards to align with the restoration processing stripe
    RestorationTileLimits limits;
    limits.v_start = AOMMAX(tile_rect->top, unit_top - voffset);
    limits.v_end =
        (unit_bottom < tile_rect->bottom) ? unit_bottom - voffset : unit_bottom;

    av1_foreach_rest_unit_in_row(
        &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0,
        hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
        av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);

    y0 += h;
  }
}
1295 
// Visits every restoration unit of one plane by running
// foreach_rest_unit_in_tile() over tile_rect, using the restoration layout
// stored in cm->rst_info[plane] and the fixed LR_TILE_ROW / LR_TILE_COL /
// LR_TILE_COLS tile coordinates.
void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
                                    rest_unit_visitor_t on_rest_unit,
                                    void *priv, AV1PixelRect *tile_rect,
                                    int32_t *tmpbuf,
                                    RestorationLineBuffers *rlbs) {
  const RestorationInfo *const plane_info = &cm->rst_info[plane];

  // Vertical subsampling only applies to the chroma planes (plane > 0).
  const int ss_y = (plane > 0) && cm->seq_params.subsampling_y;

  const int horz_units = plane_info->horz_units_per_tile;
  const int vert_units = plane_info->vert_units_per_tile;
  const int total_units = plane_info->units_per_tile;
  const int unit_size = plane_info->restoration_unit_size;

  foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
                            horz_units, vert_units, total_units, unit_size,
                            ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs);
}
1311 
// Computes the range of restoration units associated with the superblock at
// (mi_row, mi_col) on the given plane.
//
// On success the column range [*rcol0, *rcol1) and row range [*rrow0, *rrow1)
// are written and a nonzero value is returned when both ranges are non-empty.
// Returns 0 without touching the outputs when bsize is not the sequence's
// superblock size or when the plane has restoration disabled (RESTORE_NONE).
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1) {
  assert(rcol0 && rcol1 && rrow0 && rrow1);

  if (bsize != cm->seq_params.sb_size) return 0;

  const RestorationInfo *const rsi = &cm->rst_info[plane];
  if (rsi->frame_restoration_type == RESTORE_NONE) return 0;

  assert(!cm->features.all_lossless);

  const int is_uv = plane > 0;
  const int ss_x = is_uv && cm->seq_params.subsampling_x;
  const int ss_y = is_uv && cm->seq_params.subsampling_y;

  // The whole frame is treated as a single tile whose origin is mi position
  // (0, 0), so mi_row/mi_col are already tile-relative.
  const AV1PixelRect frame_rect = av1_whole_frame_rect(cm, is_uv);
  const int frame_w = frame_rect.right - frame_rect.left;
  const int frame_h = frame_rect.bottom - frame_rect.top;

  // Mi-unit coordinates of the superblock's far (bottom/right) edges.
  const int mi_row_end = mi_row + mi_size_high[bsize];
  const int mi_col_end = mi_col + mi_size_wide[bsize];

  // Number of restoration units actually present in the frame; this may be
  // smaller than rsi->horz_units_per_tile / rsi->vert_units_per_tile.
  const int unit_size = rsi->restoration_unit_size;
  const int max_cols = av1_lr_count_units_in_tile(unit_size, frame_w);
  const int max_rows = av1_lr_count_units_in_tile(unit_size, frame_h);

  // Converting an mi position m to a restoration-unit index is a division:
  //   index = m * num / den
  // Vertically, num is the mi size in pixels on this plane and den is the
  // unit size. Horizontally, restoration units live in the upscaled frame
  // while mi positions are in the downscaled one. With D the superres
  // denominator and N the superres numerator, the upscaled offset u satisfies
  //   MI_SIZE * m = (N / D) * u   =>   u = D * MI_SIZE * m / N
  // so num gains a factor D and den a factor N when superres is active.
  int num_x = MI_SIZE >> ss_x;
  int den_x = unit_size;
  if (av1_superres_scaled(cm)) {
    num_x *= cm->superres_scale_denominator;
    den_x *= SCALE_NUMERATOR;
  }
  const int num_y = MI_SIZE >> ss_y;
  const int den_y = unit_size;

  // Adding (den - 1) before dividing rounds the quotient up.
  const int round_x = den_x - 1;
  const int round_y = den_y - 1;

  // First unit column/row that does not start before the superblock's
  // top-left corner (a superblock starting at unit column 10.1 maps to
  // column 11).
  const int first_col = (mi_col * num_x + round_x) / den_x;
  const int first_row = (mi_row * num_y + round_y) / den_y;

  // Same computation for the superblock to the bottom-right. Near the frame's
  // bottom or right edge that unit may not exist, so the result is clamped.
  const int end_col = (mi_col_end * num_x + round_x) / den_x;
  const int end_row = (mi_row_end * num_y + round_y) / den_y;

  *rcol0 = first_col;
  *rrow0 = first_row;
  *rcol1 = AOMMIN(end_col, max_cols);
  *rrow1 = AOMMIN(end_row, max_rows);

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1386 
// Pads `height` rows to the left and right: for every row, the first pixel is
// replicated into the `extend` pixels before it and the last pixel into the
// `extend` pixels after it.
//
// `buf` points at the first pixel of the first row and `stride` is the byte
// distance between rows. `width` and `extend` are counted in pixels; pixels
// are 16 bits wide when use_highbitdepth is set and 8 bits otherwise.
static void extend_lines(uint8_t *buf, int width, int height, int stride,
                         int extend, int use_highbitdepth) {
  uint8_t *row = buf;
  if (use_highbitdepth) {
    for (int y = 0; y < height; ++y, row += stride) {
      uint16_t *px = (uint16_t *)row;
      aom_memset16(px - extend, px[0], extend);
      aom_memset16(px + width, px[width - 1], extend);
    }
  } else {
    for (int y = 0; y < height; ++y, row += stride) {
      memset(row - extend, row[0], extend);
      memset(row + width, row[width - 1], extend);
    }
  }
}
1402 
// Saves up to RESTORATION_CTX_VERT lines of `frame`, starting at line `row` of
// the given plane, into the boundary buffer slot for `stripe`. The "above"
// buffer is used when is_above is nonzero, the "below" buffer otherwise.
//
// When superres is active the lines are upscaled on the way in; otherwise they
// are copied as-is. The stored lines are then padded horizontally by
// RESTORATION_EXTRA_HORZ pixels on each side.
static void save_deblock_boundary_lines(
    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
    int stripe, int use_highbd, int is_above,
    RestorationStripeBoundaries *boundaries) {
  const int is_uv = plane > 0;

  // Source: byte pointer to line `row` of the plane (strides are converted
  // from pixels to bytes by shifting with use_highbd).
  const int src_stride = frame->strides[is_uv] << use_highbd;
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
  const uint8_t *src_rows = src_buf + row * src_stride;

  // Destination: the stripe's slot, skipping the left padding area.
  uint8_t *bdry_buf;
  if (is_above) {
    bdry_buf = boundaries->stripe_boundary_above;
  } else {
    bdry_buf = boundaries->stripe_boundary_below;
  }
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;

  // In rare cases a processing stripe ends 1px above the crop border. We still
  // want deblocked pixels from below the stripe, but only one such line
  // exists, so it is fetched once and duplicated further down. This is
  // equivalent to clamping the sample locations against the crop border.
  const int lines_to_save =
      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
  assert(lines_to_save == 1 || lines_to_save == 2);

  const int superres = av1_superres_scaled(cm);

  int upscaled_width;
  if (superres) {
    const int ss_x = is_uv && cm->seq_params.subsampling_x;
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
  } else {
    upscaled_width = frame->crop_widths[is_uv];
  }
  const int line_bytes = upscaled_width << use_highbd;

  if (superres) {
    // The upscaler takes pixel strides, and high-bitdepth buffers must be
    // passed in their CONVERT_TO_BYTEPTR form.
    const uint8_t *up_src = src_rows;
    uint8_t *up_dst = bdry_rows;
    if (use_highbd) {
      up_src = CONVERT_TO_BYTEPTR(src_rows);
      up_dst = CONVERT_TO_BYTEPTR(bdry_rows);
    }
    av1_upscale_normative_rows(cm, up_src, frame->strides[is_uv], up_dst,
                               boundaries->stripe_boundary_stride, plane,
                               lines_to_save);
  } else {
    for (int line = 0; line < lines_to_save; ++line) {
      memcpy(bdry_rows + line * bdry_stride, src_rows + line * src_stride,
             line_bytes);
    }
  }

  // Only one line was available: replicate it into the second slot.
  if (lines_to_save == 1) {
    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
  }

  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
               RESTORATION_EXTRA_HORZ, use_highbd);
}
1457 
// Fills all RESTORATION_CTX_VERT context lines of the boundary buffer slot for
// `stripe` with copies of the single line `row` of the given plane, then pads
// them horizontally by RESTORATION_EXTRA_HORZ pixels on each side. The "above"
// buffer is used when is_above is nonzero, the "below" buffer otherwise.
static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                     const AV1_COMMON *cm, int plane, int row,
                                     int stripe, int use_highbd, int is_above,
                                     RestorationStripeBoundaries *boundaries) {
  const int is_uv = plane > 0;

  // Source: byte pointer to line `row` of the plane.
  const int src_stride = frame->strides[is_uv] << use_highbd;
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
  const uint8_t *src_rows = src_buf + row * src_stride;

  // Destination: the stripe's slot, skipping the left padding area.
  uint8_t *bdry_buf;
  if (is_above) {
    bdry_buf = boundaries->stripe_boundary_above;
  } else {
    bdry_buf = boundaries->stripe_boundary_below;
  }
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;

  // By the time this function runs, superres has already been applied, so
  // the line can be taken straight from the upscaled frame; only the width has
  // to reflect the upscaled size.
  const int ss_x = is_uv && cm->seq_params.subsampling_x;
  int upscaled_width = frame->crop_widths[is_uv];
  if (av1_superres_scaled(cm)) {
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
  }
  const int line_bytes = upscaled_width << use_highbd;

  // Every context line receives the same source line: the intent is to
  // (effectively) extend the outermost row of CDEF data into a border rather
  // than to use deblocked pixels from the neighbouring tile.
  for (int line = 0; line < RESTORATION_CTX_VERT; ++line) {
    memcpy(bdry_rows + line * bdry_stride, src_rows, line_bytes);
  }

  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
               RESTORATION_EXTRA_HORZ, use_highbd);
}
1492 
// Saves the boundary context lines for every processing stripe of one plane.
//
// With after_cdef == 0, deblocked lines are stored for each stripe edge that
// has picture content beyond it (every edge except the frame's top and
// bottom). With after_cdef != 0, the remaining edges are filled instead, using
// the outermost line inside the frame.
static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                         int use_highbd, int plane,
                                         AV1_COMMON *cm, int after_cdef) {
  const int is_uv = plane > 0;
  const int ss_y = is_uv && cm->seq_params.subsampling_y;
  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;

  // The whole frame is handled as one tile, so the stripe index within the
  // tile is also the stripe index within the frame.
  const AV1PixelRect rect = av1_whole_frame_rect(cm, is_uv);
  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);

  RestorationStripeBoundaries *const boundaries =
      &cm->rst_info[plane].boundaries;

  for (int stripe = 0;; ++stripe) {
    // Stripes are shifted up by stripe_off; the first one is clamped to the
    // top of the tile.
    const int rel_top = AOMMAX(0, stripe * stripe_height - stripe_off);
    const int y0 = rect.top + rel_top;
    if (y0 >= rect.bottom) break;

    const int rel_bottom = (stripe + 1) * stripe_height - stripe_off;
    const int y1 = AOMMIN(rect.top + rel_bottom, rect.bottom);

    // CDEF pixels are only used at the very top and bottom of the frame;
    // every other stripe edge takes deblocked pixels as context.
    const int deblock_above = stripe > 0;
    const int deblock_below = y1 < plane_height;

    if (after_cdef) {
      // A boundary needs CDEF context exactly when no deblocked context was
      // saved for it. The line stored is the outermost one inside the tile,
      // not data from outside it.
      if (!deblock_above) {
        save_cdef_boundary_lines(frame, cm, plane, y0, stripe, use_highbd, 1,
                                 boundaries);
      }
      if (!deblock_below) {
        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe, use_highbd,
                                 0, boundaries);
      }
    } else {
      // Deblocked context: the lines just above the stripe and the lines
      // starting at its bottom edge.
      if (deblock_above) {
        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
                                    stripe, use_highbd, 1, boundaries);
      }
      if (deblock_below) {
        save_deblock_boundary_lines(frame, cm, plane, y1, stripe, use_highbd, 0,
                                    boundaries);
      }
    }
  }
}
1555 
1556 // For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1557 // lines to be used as boundary in the loop restoration process. The
1558 // lines are saved in rst_internal.stripe_boundary_lines
av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG * frame,AV1_COMMON * cm,int after_cdef)1559 void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1560                                               AV1_COMMON *cm, int after_cdef) {
1561   const int num_planes = av1_num_planes(cm);
1562   const int use_highbd = cm->seq_params.use_highbitdepth;
1563   for (int p = 0; p < num_planes; ++p) {
1564     save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
1565   }
1566 }
1567