/*
 * Copyright (c) 2019, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <string.h>

#include "aom_dsp/aom_dsp_common.h"
#include "av1/common/av1_common_int.h"
#include "av1/encoder/cnn.h"

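// Clamp index a into the valid range [0, hi - 1]; e.g. CLAMPINDEX(-3, 10)
// evaluates to 0 and CLAMPINDEX(12, 10) to 9. Used to implement replicate
// padding at the image borders.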
#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))

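// Arguments of one convolution job, bundled so it can be handed to a worker
// thread; start_idx and th_step partition the work (output channels, or
// output columns for 1x1 filters) among the threads.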
typedef struct {
  const float **input;
  int in_width;
  int in_height;
  int in_stride;
  const CNN_LAYER_CONFIG *layer_config;
  float **output;
  int out_stride;
  int start_idx;
  int th_step;
} CONVOLVE_OPS;

static INLINE float softsign(float x) { return x / (fabsf(x) + 1.0f); }

static INLINE float relu(float x) { return (x < 0) ? 0 : x; }

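// A multi-channel 2D float buffer. All channels share one allocation owned by
// buf[0]; buf[c] points into it at channel c's plane (see realloc_tensor()).
// When allocsize is 0 the tensor aliases memory it does not own.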
typedef struct {
  int allocsize;
  int channels;
  int width, height, stride;
  float *buf[CNN_MAX_CHANNELS];
} TENSOR;

static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(*tensor)); }

static void free_tensor(TENSOR *tensor) {
  if (tensor->allocsize) {
    aom_free(tensor->buf[0]);
    tensor->buf[0] = NULL;
    tensor->allocsize = 0;
  }
}
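
// (Re)allocate the tensor to hold channels x width x height floats, reusing
// the current buffer when it is already large enough. Returns false on
// allocation failure.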
static bool realloc_tensor(TENSOR *tensor, int channels, int width,
                           int height) {
  const int newallocsize = channels * width * height;
  if (tensor->allocsize < newallocsize) {
    free_tensor(tensor);
    tensor->buf[0] =
        (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize);
    if (!tensor->buf[0]) return false;
    tensor->allocsize = newallocsize;
  }
  tensor->width = width;
  tensor->height = height;
  tensor->stride = width;
  tensor->channels = channels;
  for (int c = 1; c < channels; ++c)
    tensor->buf[c] = &tensor->buf[0][c * width * height];
  return true;
}
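
// Copy the first copy_channels channels of src into dst, starting at channel
// dst_offset of dst. Falls back to a row-by-row copy when either tensor is
// strided.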
static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset,
                        TENSOR *dst) {
  assert(src->width == dst->width);
  assert(src->height == dst->height);
  assert(copy_channels <= src->channels);
  if (src->stride == dst->width && dst->stride == dst->width) {
    for (int c = 0; c < copy_channels; ++c) {
      memcpy(dst->buf[dst_offset + c], src->buf[c],
             sizeof(*dst->buf[0]) * src->width * src->height);
    }
  } else {
    for (int c = 0; c < copy_channels; ++c) {
      for (int r = 0; r < dst->height; ++r) {
        memcpy(&dst->buf[dst_offset + c][r * dst->stride],
               &src->buf[c][r * src->stride],
               dst->width * sizeof(*dst->buf[c]));
      }
    }
  }
}
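
// Point the tensor at caller-owned channel buffers without allocating;
// allocsize is left at 0 so free_tensor() will not free the aliased memory.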
static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS],
                          int channels, int width, int height, int stride) {
  tensor->allocsize = 0;
  tensor->channels = channels;
  tensor->width = width;
  tensor->height = height;
  tensor->stride = stride;
  if (buf) {
    for (int c = 0; c < channels; ++c) tensor->buf[c] = buf[c];
  } else {
    for (int c = 0; c < channels; ++c) tensor->buf[c] = NULL;
  }
}

static void swap_tensor(TENSOR *t1, TENSOR *t2) {
  TENSOR t = *t1;
  *t1 = *t2;
  *t2 = t;
}

// Concatenate src onto dst along the channel dimension: the result keeps
// dst's original channels first, followed by the channels of src.
static bool concat_tensor(const TENSOR *src, TENSOR *dst) {
  assert(src->width == dst->width);
  assert(src->height == dst->height);

  const int dst_channels = dst->channels;
  const int channels = dst->channels + src->channels;
  const int newallocsize = channels * dst->width * dst->height;
  if (dst->allocsize < newallocsize) {
    TENSOR t;
    init_tensor(&t);
    // Allocate new buffers and copy the dst channels first.
    if (!realloc_tensor(&t, channels, dst->width, dst->height)) return false;
    copy_tensor(dst, dst->channels, 0, &t);
    // Swap the tensors and free the old buffers.
    swap_tensor(dst, &t);
    free_tensor(&t);
  }
  for (int c = 1; c < channels; ++c)
    dst->buf[c] = &dst->buf[0][c * dst->width * dst->height];
  // Copy the channels in src after the first dst_channels channels.
  copy_tensor(src, src->channels, dst_channels, dst);
  return true;
}

int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) {
  return (t1->width == t2->width && t1->height == t2->height);
}

int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) {
  return (t1->channels == t2->channels && t1->width == t2->width &&
          t1->height == t2->height);
}

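// Compute a layer's output dimensions from its input dimensions. With the
// SAME paddings the output is ceil(in / skip); with PADDING_VALID it is
// ceil((in - filter + 1) / skip). For example, a 3x3 filter with skip 2 maps
// a 10x10 input to 5x5 (SAME) or 4x4 (VALID). Deconvolution inverts these
// relations.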
void av1_find_cnn_layer_output_size(int in_width, int in_height,
                                    const CNN_LAYER_CONFIG *layer_config,
                                    int *out_width, int *out_height) {
  assert(layer_config->skip_width > 0);
  assert(layer_config->skip_height > 0);
  if (!layer_config->deconvolve) {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
      case PADDING_SAME_REPLICATE:
        *out_width = (in_width + layer_config->skip_width - 1) /
                     layer_config->skip_width;
        *out_height = (in_height + layer_config->skip_height - 1) /
                      layer_config->skip_height;
        break;
      case PADDING_VALID:
        *out_width =
            (in_width - layer_config->filter_width + layer_config->skip_width) /
            layer_config->skip_width;
        *out_height = (in_height - layer_config->filter_height +
                       layer_config->skip_height) /
                      layer_config->skip_height;
        break;
      default: assert(0 && "Unknown padding type");
    }
  } else {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
      case PADDING_SAME_REPLICATE:
        *out_width = in_width * layer_config->skip_width;
        *out_height = in_height * layer_config->skip_height;
        break;
      case PADDING_VALID:
        *out_width = (in_width - 1) * layer_config->skip_width +
                     layer_config->filter_width;
        *out_height = (in_height - 1) * layer_config->skip_height +
                      layer_config->filter_height;
        break;
      default: assert(0 && "Unknown padding type");
    }
  }
}
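
// Determine how many output channels each branch carries after this layer,
// accounting for channels copied to other branches and for branches that are
// combined into this one.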
void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config,
                           int channels_per_branch[]) {
  int branch = layer_config->branch;
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      if (layer_config->branch_copy_type == BRANCH_INPUT) {
        channels_per_branch[b] = layer_config->in_channels;
      } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
        channels_per_branch[b] = layer_config->out_channels;
      } else if (layer_config->branch_copy_type == BRANCH_COMBINED) {
        channels_per_branch[b] = layer_config->out_channels;
        for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
          if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
            assert(channels_per_branch[c] > 0);
            channels_per_branch[b] += channels_per_branch[c];
          }
        }
      }
    }
  }
  channels_per_branch[branch] = layer_config->out_channels;
  for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
    if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
      assert(channels_per_branch[c] > 0);
      channels_per_branch[branch] += channels_per_branch[c];
    }
  }
}

#if CONFIG_DEBUG
static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) {
  const int num_layers = cnn_config->num_layers;
  const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config;

  for (int idx = 0; idx < num_layers; idx++) {
    if (layer_configs[idx].output_num != -1) {
      return 1;
    }
  }
  return 0;
}
#endif  // CONFIG_DEBUG

void av1_find_cnn_output_size(int in_width, int in_height,
                              const CNN_CONFIG *cnn_config, int *out_width,
                              int *out_height, int *out_channels) {
  int channels_per_branch[CNN_MAX_BRANCHES] = { 0 };
  int i_width[CNN_MAX_BRANCHES] = { 0 };
  int i_height[CNN_MAX_BRANCHES] = { 0 };
  i_width[0] = in_width + cnn_config->ext_width * 2;
  i_height[0] = in_height + cnn_config->ext_height * 2;

#if CONFIG_DEBUG
  assert(cnn_has_at_least_one_output(cnn_config));
#endif

  for (int i = 0; i < cnn_config->num_layers; ++i) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i];
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
    const int branch = layer_config->branch;
    int o_width = 0, o_height = 0;

    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          assert(i_width[branch] > 0 && i_height[branch] > 0);
          i_width[b] = i_width[branch];
          i_height[b] = i_height[branch];
        }
      }
    }

    av1_find_cnn_layer_output_size(i_width[branch], i_height[branch],
                                   layer_config, &o_width, &o_height);
    i_width[branch] = o_width;
    i_height[branch] = o_height;

    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          i_width[b] = o_width;
          i_height[b] = o_height;
        }
      }
    }

    find_cnn_out_channels(layer_config, channels_per_branch);

    const int output_num = layer_config->output_num;
    if (output_num != -1) {  // Current layer is an output layer
      out_width[output_num] = o_width;
      out_height[output_num] = o_height;
      out_channels[output_num] = channels_per_branch[layer_config->branch];
    }
  }
}
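
// Start offset that keeps the samples visited by a strided convolution
// roughly centered in the input, capped at half the filter width; a helper
// for the SAME-padding convolutions below.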
static INLINE int get_start_shift_convolve(int width, int filt_width,
                                           int stride) {
  const int mod = (width % stride);
  const int filt_off = (filt_width - 1) / 2;
  const int dif = (mod ? mod - 1 : stride - 1);
  return AOMMIN((dif + (filt_width % 2)) / 2, filt_off);
}

void av1_cnn_add_c(float **output, int channels, int width, int height,
                   int stride, const float **add) {
  for (int c = 0; c < channels; ++c) {
    for (int i = 0; i < height; ++i)
      for (int j = 0; j < width; ++j)
        output[c][i * stride + j] += add[c][i * stride + j];
  }
}

void av1_cnn_activate_c(float **output, int channels, int width, int height,
                        int stride, ACTIVATION layer_activation) {
  if (layer_activation == RELU) {
    for (int c = 0; c < channels; ++c) {
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          output[c][i * stride + j] = relu(output[c][i * stride + j]);
    }
  } else if (layer_activation == SOFTSIGN) {
    for (int c = 0; c < channels; ++c) {
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          output[c][i * stride + j] = softsign(output[c][i * stride + j]);
    }
  } else if (layer_activation == SIGMOID) {
    assert(0 && "Sigmoid is not supported in CNN.");  // TODO: add support.
  } else if (layer_activation != NONE) {
    assert(0 && "Unknown activation type");
  }
}

static bool copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
                                           const CNN_LAYER_CONFIG *layer_config,
                                           int branch, TENSOR branch_output[]) {
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      // Copy the layer's active tensor to the output tensor of branch b if b
      // is set in the mask. That output then serves as the input to the first
      // layer of branch b.
      int copy_channels = branch_config->channels_to_copy > 0
                              ? branch_config->channels_to_copy
                              : layer_active_tensor->channels;
      if (!realloc_tensor(&branch_output[b], copy_channels,
                          layer_active_tensor->width,
                          layer_active_tensor->height)) {
        return false;
      }
      copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]);
    }
  }
  return true;
}

// CNNConvolve specialization for maxpool set to 1, skip_width or skip_height
// greater than 1, and padding equal to PADDING_SAME_ZERO.
static void convolve_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l - filter_height_half;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m - filter_width_half;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}

// CNNConvolve specialization for maxpool set to 1, skip_width or skip_height
// greater than 1, and padding equal to PADDING_SAME_REPLICATE.
static void convolve_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii =
                    CLAMPINDEX(hh + l - filter_height_half, in_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj =
                      CLAMPINDEX(ww + m - filter_width_half, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}

// CNNConvolve specialization for maxpool set to 1, skip_width or skip_height
// greater than 1, and padding equal to PADDING_VALID.
static void convolve_maxpool_padding_valid(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m;
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}

// CNNConvolve specialization for maxpool set to 0, with filter_height and
// filter_width both equal to 1.
static void convolve_element_wise(const float **input, int in_width,
                                  int in_height, int in_stride,
                                  const CNN_LAYER_CONFIG *const layer_config,
                                  float **output, int out_stride, int start_idx,
                                  int step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) +
      start_idx * layer_config->skip_width;
  const int out_w_step = AOMMAX(step, 1);
  const int in_w_step = layer_config->skip_width * out_w_step;
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int in_h = h * in_stride;
      const int out_h = u * out_stride + start_idx;
      for (int w = start_w, out_index = out_h; w < in_width;
           w += in_w_step, out_index += out_w_step) {
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          sum += layer_config->weights[k * layer_config->out_channels + i] *
                 input[k][in_h + w];
        }
        output[i][out_index] = sum;
      }
    }
  }
}

// CNNConvolve specialization for maxpool set to 0 and padding equal to
// PADDING_SAME_ZERO.
static void convolve_no_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int filter_width_half,
    const int filter_height_half, const int ii_shift, const int jj_shift,
    const int channel_step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w = get_start_shift_convolve(
      in_width, layer_config->filter_width, layer_config->skip_width);
  const int end_ii_shift = filter_height_half + 1;
  const int end_jj_shift = filter_width_half + 1;
  // *_filter_margin stores the number of pixels along a dimension in the
  // intersection of the complement of the image in the extended image
  // and the filter.
  const int top_filter_margin = layer_config->filter_width * ii_shift;
  const int right_filter_margin = end_jj_shift - in_width;
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int top_cstep =
          AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
              cstep +
          i;
      const int start_ii = AOMMAX(0, h - ii_shift);
      const int end_ii = AOMMIN(in_height, h + end_ii_shift);
      for (int w = start_w, out_index = out_h; w < in_width;
           w += layer_config->skip_width, ++out_index) {
        const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
        const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep;
        const int start_jj = AOMMAX(0, w - jj_shift);
        const int end_jj = AOMMIN(in_width, w + end_jj_shift);
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + top_cstep;
          for (int ii = start_ii; ii < end_ii; ++ii) {
            off += left_cstep;
            for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
            }
            off += right_cstep;
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}

// CNNConvolve specialization for maxpool set to 0 and padding equal to
// PADDING_SAME_REPLICATE.
static void convolve_no_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int ii_shift, const int jj_shift,
    const int channel_step) {
  // h and w are shifted to an offset coordinate system to reduce in-loop
  // computation.
  const int start_h =
      get_start_shift_convolve(in_height, layer_config->filter_height,
                               layer_config->skip_height) -
      ii_shift;
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) -
      jj_shift;
  const int end_h = in_height - ii_shift;
  const int end_w = in_width - jj_shift;
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < end_h;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = start_w, out_index = out_h; w < end_w;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            const int clamped_ii = CLAMPINDEX(ii, in_height);
            for (int jj = w; jj < upper_jj_index; ++jj) {
              const int clamped_jj = CLAMPINDEX(jj, in_width);
              assert(clamped_ii >= 0 && clamped_ii < in_height &&
                     clamped_jj >= 0 && clamped_jj < in_width);
              sum += layer_config->weights[off] *
                     input[k][clamped_ii * in_stride + clamped_jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}

// CNNConvolve specialization for maxpool set to 0 and padding equal to
// PADDING_VALID.
void av1_cnn_convolve_no_maxpool_padding_valid_c(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
    int start_idx, int cstep, int channel_step) {
  assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
         !layer_config->maxpool);
  assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
  assert(layer_config->pad == PADDING_VALID);
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = 0, out_index = out_h;
           w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            for (int jj = w; jj < upper_jj_index; ++jj) {
              assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}
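
// Dispatch a (non-deconvolve) convolution to the specialization that matches
// the layer's maxpool, filter-size and padding configuration. start_idx and
// step partition the work when called from multiple threads.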
static void av1_cnn_convolve(const float **input, int in_width, int in_height,
                             int in_stride,
                             const CNN_LAYER_CONFIG *layer_config,
                             float **output, int out_stride, int start_idx,
                             int step) {
  assert(!layer_config->deconvolve);
  const int cstep = layer_config->in_channels * layer_config->out_channels;
  const int filter_height_half = layer_config->filter_height >> 1;
  const int filter_width_half = layer_config->filter_width >> 1;
  const int channel_step = AOMMAX(step, 1);

  if (layer_config->maxpool &&
      (layer_config->skip_height > 1 || layer_config->skip_width > 1)) {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        convolve_maxpool_padding_zero(input, in_width, in_height, in_stride,
                                      layer_config, output, out_stride, cstep,
                                      filter_width_half, filter_height_half);
        break;
      case PADDING_SAME_REPLICATE:
        convolve_maxpool_padding_replicate(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, cstep, filter_width_half, filter_height_half);
        break;
      case PADDING_VALID:
        convolve_maxpool_padding_valid(input, in_width, in_height, in_stride,
                                       layer_config, output, out_stride, cstep);
        break;
      default: assert(0 && "Unknown padding type");
    }
  } else {
    // A 1x1 filter reduces to an element-wise matrix multiplication.
    if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
      convolve_element_wise(input, in_width, in_height, in_stride, layer_config,
                            output, out_stride, start_idx, step);
      return;
    }
    const int ii_shift =
        filter_height_half - (layer_config->filter_height - 1) % 2;
    const int jj_shift =
        filter_width_half - (layer_config->filter_width - 1) % 2;
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        convolve_no_maxpool_padding_zero(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, filter_width_half, filter_height_half,
            ii_shift, jj_shift, channel_step);
        break;
      case PADDING_SAME_REPLICATE:
        convolve_no_maxpool_padding_replicate(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, ii_shift, jj_shift, channel_step);
        break;
      case PADDING_VALID:
        av1_cnn_convolve_no_maxpool_padding_valid(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, channel_step);
        break;
      default: assert(0 && "Unknown padding type");
    }
  }
}
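
// Worker hook: unpack a CONVOLVE_OPS and convolve this thread's share of the
// layer. Returns 1 so the worker interface treats the job as successful.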
static int convolve_layer(void *arg1, void *arg2) {
  const CONVOLVE_OPS *convolve_ops = arg1;
  (void)arg2;
  av1_cnn_convolve(
      convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height,
      convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output,
      convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step);
  return 1;
}

static void convolve_layer_mt(const float **input, int in_width, int in_height,
                              int in_stride,
                              const CNN_LAYER_CONFIG *layer_config,
                              const CNN_THREAD_DATA *thread_data,
                              float **output, int out_stride) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  const int num_workers = thread_data->num_workers;
  assert(thread_data->workers);

  CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    AVxWorker *const worker = &thread_data->workers[th];
    winterface->reset(worker);

    CONVOLVE_OPS convolve_op = { input,      in_width,     in_height,
                                 in_stride,  layer_config, output,
                                 out_stride, th,           num_workers };
    convolve_ops[th] = convolve_op;
    worker->hook = convolve_layer;
    worker->data1 = &(convolve_ops[th]);
    worker->data2 = NULL;

    // Start convolving.
    if (th == num_workers - 1) {
      winterface->execute(worker);
    } else {
      winterface->launch(worker);
    }
  }

  // Wait until all workers have finished.
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    winterface->sync(&thread_data->workers[th]);
  }
}

static INLINE int get_start_shift_deconvolve(int filt_width, int stride) {
  const int dif = AOMMAX(filt_width - stride, 0);
  return dif / 2;
}
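
// Apply batch normalization per channel using parameters folded for
// inference: out = gamma * (x - mean) / std + beta.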
void av1_cnn_batchnorm_c(float **image, int channels, int width, int height,
                         int stride, const float *gamma, const float *beta,
                         const float *mean, const float *std) {
  assert(gamma && beta && mean && std && "batchnorm has null parameter!");
  for (int ch = 0; ch < channels; ch++) {
    const float ch_gamma = gamma[ch];
    const float ch_beta = beta[ch];
    const float ch_mean = mean[ch];
    const float ch_std = std[ch];
    float *image_row = image[ch];

    for (int row = 0; row < height; row++) {
      for (int col = 0; col < width; col++) {
        image_row[col] =
            ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta;
      }
      image_row += stride;
    }
  }
}

void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
                          int in_stride, const CNN_LAYER_CONFIG *layer_config,
                          float **output, int out_stride) {
  assert(layer_config->deconvolve);

  const int cstep = layer_config->in_channels * layer_config->out_channels;

  int out_width = 0;
  int out_height = 0;
  av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width,
                                 &out_height);
  switch (layer_config->pad) {
    case PADDING_SAME_ZERO:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h =
                    u - l +
                    get_start_shift_deconvolve(layer_config->filter_height,
                                               layer_config->skip_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w =
                      v - m +
                      get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_SAME_REPLICATE:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h =
                    u - l +
                    get_start_shift_deconvolve(layer_config->filter_height,
                                               layer_config->skip_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w =
                      v - m +
                      get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii =
                      CLAMPINDEX(h / layer_config->skip_height, in_height);
                  const int jj =
                      CLAMPINDEX(w / layer_config->skip_width, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_VALID:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h = u - l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w = v - m;
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    default: assert(0 && "Unknown padding type");
  }
}
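
// Run the full network: for each layer, ping-pong between tensor1 (input) and
// tensor2 (output) per branch, handle branch copies and combines, then apply
// the activation and optional batch norm. Output layers write directly into
// the caller's CNN_MULTI_OUT buffers.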
bool av1_cnn_predict_c(const float **input, int in_width, int in_height,
                       int in_stride, const CNN_CONFIG *cnn_config,
                       const CNN_THREAD_DATA *thread_data,
                       CNN_MULTI_OUT *output_struct) {
  bool success = false;
  TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } };
  TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } };

  float **output[CNN_MAX_BRANCHES];
  const int *out_chs = output_struct->output_channels;
  output[0] = output_struct->output_buffer;
  for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) {
    output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1];
  }

  int i_width = in_width;
  int i_height = in_height;
  int o_width = 0, o_height = 0;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    init_tensor(&tensor1[b]);
    init_tensor(&tensor2[b]);
  }

  const int *out_stride = output_struct->output_strides;
  for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
    const int branch = layer_config->branch;
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;

    // Allocate input tensor
    if (layer == 0) {       // First layer
      assert(branch == 0);  // First layer must be primary branch
      assign_tensor(&tensor1[branch], (float **)input,
                    layer_config->in_channels, in_width, in_height, in_stride);
    } else {  // Non-first layer
      // Swap tensor1 and tensor2
      swap_tensor(&tensor1[branch], &tensor2[branch]);

      i_width = tensor1[branch].width;
      i_height = tensor1[branch].height;
    }

    // Allocate output tensor
    av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width,
                                   &o_height);
    const int output_num = layer_config->output_num;
    if (output_num == -1) {  // Non-output layer
      if (!realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
                          o_height)) {
        goto Error;
      }
    } else {  // Output layer
      free_tensor(&tensor2[branch]);
      assign_tensor(&tensor2[branch], output[output_num],
                    layer_config->out_channels, o_width, o_height,
                    out_stride[output_num]);
    }

    // If we are combining branches make sure that the branch to combine
    // is different from the current branch.
    assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC,
                   !(branch_config->branches_to_combine & (1 << branch))));

    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      if (!copy_active_tensor_to_branches(&tensor1[branch], layer_config,
                                          branch, tensor2)) {
        goto Error;
      }
    }
    // Check consistency of input and output channels
    assert(tensor1[branch].channels == layer_config->in_channels);
    assert(tensor2[branch].channels == layer_config->out_channels);

    // Convolve/Deconvolve
    if (!cnn_config->layer_config[layer].deconvolve) {
      if (thread_data->num_workers > 1) {
        convolve_layer_mt((const float **)tensor1[branch].buf,
                          tensor1[branch].width, tensor1[branch].height,
                          tensor1[branch].stride, layer_config, thread_data,
                          tensor2[branch].buf, tensor2[branch].stride);
      } else {
        av1_cnn_convolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride, 0, 1);
      }
    } else {
      av1_cnn_deconvolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride);
    }

    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
                                          branch, tensor2)) {
        goto Error;
      }
    }

    // Add tensors from other branches if needed
    if (layer_config->branch_combine_type == BRANCH_ADD) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
          assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch]));
          av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels,
                      tensor2[branch].width, tensor2[branch].height,
                      tensor2[branch].stride, (const float **)tensor2[b].buf);
        }
      }
    }

    // Non-linearity
    av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels,
                     tensor2[branch].width, tensor2[branch].height,
                     tensor2[branch].stride, layer_config->activation);

    if (layer_config->bn_params.bn_gamma) {
      av1_cnn_batchnorm(
          tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width,
          tensor2[branch].height, tensor2[branch].stride,
          layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta,
          layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std);
    }

    // Concatenate tensors
    if (layer_config->branch_combine_type == BRANCH_CAT) {
      if (output_num == -1) {  // Non-output layer
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            assert(tensor2[b].channels > 0);
            if (!concat_tensor(&tensor2[b], &tensor2[branch])) goto Error;
          }
        }
      } else {  // Output layer
        const int existing_channels = tensor2[branch].channels;
        int num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Needed only to assign the new channel buffers
            num_chs += tensor2[b].channels;
          }
        }
        assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width,
                      o_height, out_stride[output_num]);

        num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Copy the combined branches into the newly assigned buffers.
            copy_tensor(&tensor2[b], tensor2[b].channels, num_chs,
                        &tensor2[branch]);
            num_chs += tensor2[b].channels;
          }
        }
      }
    }

    if (layer_config->branch_copy_type == BRANCH_COMBINED) {
      if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
                                          branch, tensor2)) {
        goto Error;
      }
    }
  }

  success = true;
Error:
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    free_tensor(&tensor1[b]);
    free_tensor(&tensor2[b]);
  }
  return success;
}

// Assumes the output buffers are already allocated.
// Assumes all input image buffers have the same resolution and stride.
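// Pixels are normalized to [0, 1] by max_val, and the frame is padded by
// ext_width/ext_height on each side (replicate-extended when strict_bounds is
// set) before being fed to av1_cnn_predict().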
bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
                                   int stride, const CNN_CONFIG *cnn_config,
                                   const CNN_THREAD_DATA *thread_data,
                                   CNN_MULTI_OUT *output) {
  const float max_val = 255.0f;

  const int in_width = width + 2 * cnn_config->ext_width;
  const int in_height = height + 2 * cnn_config->ext_height;
  const int in_channels = cnn_config->layer_config[0].in_channels;
  float *inputs[CNN_MAX_CHANNELS];
  float *input_ =
      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
  if (!input_) return false;
  const int in_stride = in_width;

  for (int c = 0; c < in_channels; ++c) {
    inputs[c] = input_ + c * in_stride * in_height;
    float *input =
        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;

    if (cnn_config->strict_bounds) {
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
      // extend left and right
      for (int i = 0; i < height; ++i) {
        for (int j = -cnn_config->ext_width; j < 0; ++j)
          input[i * in_stride + j] = input[i * in_stride];
        for (int j = width; j < width + cnn_config->ext_width; ++j)
          input[i * in_stride + j] = input[i * in_stride + width - 1];
      }
      // extend top and bottom
      for (int i = -cnn_config->ext_height; i < 0; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[-cnn_config->ext_width], in_width * sizeof(*input));
      for (int i = height; i < height + cnn_config->ext_height; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[(height - 1) * in_stride - cnn_config->ext_width],
               in_width * sizeof(*input));
    } else {
      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
           ++i)
        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
             ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
    }
  }
  bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
                                 in_stride, cnn_config, thread_data, output);

  aom_free(input_);
  return success;
}

// Assumes the output buffers are already allocated.
// Assumes all input image buffers have the same resolution and stride.
bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
                                          int stride,
                                          const CNN_CONFIG *cnn_config,
                                          const CNN_THREAD_DATA *thread_data,
                                          int bit_depth,
                                          CNN_MULTI_OUT *output) {
  const float max_val = (float)((1 << bit_depth) - 1);

  const int in_width = width + 2 * cnn_config->ext_width;
  const int in_height = height + 2 * cnn_config->ext_height;
  const int in_channels = cnn_config->layer_config[0].in_channels;
  float *inputs[CNN_MAX_CHANNELS];
  float *input_ =
      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
  if (!input_) return false;
  const int in_stride = in_width;

  for (int c = 0; c < in_channels; ++c) {
    inputs[c] = input_ + c * in_stride * in_height;
    float *input =
        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;

    if (cnn_config->strict_bounds) {
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
      // extend left and right
      for (int i = 0; i < height; ++i) {
        for (int j = -cnn_config->ext_width; j < 0; ++j)
          input[i * in_stride + j] = input[i * in_stride];
        for (int j = width; j < width + cnn_config->ext_width; ++j)
          input[i * in_stride + j] = input[i * in_stride + width - 1];
      }
      // extend top and bottom
      for (int i = -cnn_config->ext_height; i < 0; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[-cnn_config->ext_width], in_width * sizeof(*input));
      for (int i = height; i < height + cnn_config->ext_height; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[(height - 1) * in_stride - cnn_config->ext_width],
               in_width * sizeof(*input));
    } else {
      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
           ++i)
        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
             ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
    }
  }

  bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
                                 in_stride, cnn_config, thread_data, output);

  aom_free(input_);
  return success;
}