/*
 * Copyright (c) 2019, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <string.h>  // for memcpy() and memset()

#include "aom_dsp/aom_dsp_common.h"
#include "av1/common/av1_common_int.h"
#include "av1/encoder/cnn.h"

#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))

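// Per-thread arguments for convolve_layer(): start_idx and th_step partition
// one layer's output across the worker threads.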
typedef struct {
  const float **input;
  int in_width;
  int in_height;
  int in_stride;
  const CNN_LAYER_CONFIG *layer_config;
  float **output;
  int out_stride;
  int start_idx;
  int th_step;
} CONVOLVE_OPS;

typedef float (*activation_fn)(float);

static float softsign(float x) { return x / (float)(fabsf(x) + 1.0); }

static float relu(float x) { return (x < 0) ? 0 : x; }

static float identity(float x) { return x; }

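// A multi-channel 2D tensor. All channel planes live in one allocation, and
// buf[c] points to the plane for channel c. allocsize is nonzero only when
// the tensor owns its buffer (see assign_tensor() for the non-owning case).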
typedef struct {
  int allocsize;
  int channels;
  int width, height, stride;
  float *buf[CNN_MAX_CHANNELS];
} TENSOR;

static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(*tensor)); }

static void free_tensor(TENSOR *tensor) {
  if (tensor->allocsize) {
    aom_free(tensor->buf[0]);
    tensor->buf[0] = NULL;
    tensor->allocsize = 0;
  }
}

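// Reallocates the tensor's backing buffer if it is too small for
// channels * width * height samples, then refreshes the dimensions and the
// per-channel pointers. Returns false on allocation failure.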
static bool realloc_tensor(TENSOR *tensor, int channels, int width,
                           int height) {
  const int newallocsize = channels * width * height;
  if (tensor->allocsize < newallocsize) {
    free_tensor(tensor);
    tensor->buf[0] =
        (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize);
    if (!tensor->buf[0]) return false;
    tensor->allocsize = newallocsize;
  }
  tensor->width = width;
  tensor->height = height;
  tensor->stride = width;
  tensor->channels = channels;
  for (int c = 1; c < channels; ++c)
    tensor->buf[c] = &tensor->buf[0][c * width * height];
  return true;
}

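// Copies the first copy_channels channels of src into dst, starting at
// channel dst_offset. Width and height of the two tensors must match.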
static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset,
                        TENSOR *dst) {
  assert(src->width == dst->width);
  assert(src->height == dst->height);
  assert(copy_channels <= src->channels);
  if (src->stride == dst->width && dst->stride == dst->width) {
    for (int c = 0; c < copy_channels; ++c) {
      memcpy(dst->buf[dst_offset + c], src->buf[c],
             sizeof(*dst->buf[0]) * src->width * src->height);
    }
  } else {
    for (int c = 0; c < copy_channels; ++c) {
      for (int r = 0; r < dst->height; ++r) {
        memcpy(&dst->buf[dst_offset + c][r * dst->stride],
               &src->buf[c][r * src->stride],
               dst->width * sizeof(*dst->buf[c]));
      }
    }
  }
}

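// Points the tensor at externally owned channel buffers without allocating;
// buf may be NULL to clear the channel pointers.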
static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS],
                          int channels, int width, int height, int stride) {
  tensor->allocsize = 0;
  tensor->channels = channels;
  tensor->width = width;
  tensor->height = height;
  tensor->stride = stride;
  if (buf) {
    for (int c = 0; c < channels; ++c) tensor->buf[c] = buf[c];
  } else {
    for (int c = 0; c < channels; ++c) tensor->buf[c] = NULL;
  }
}

static void swap_tensor(TENSOR *t1, TENSOR *t2) {
  TENSOR t = *t1;
  *t1 = *t2;
  *t2 = t;
}

// Concatenates src onto dst along the channel dimension: the result keeps the
// original dst channels first, followed by the src channels.
static bool concat_tensor(const TENSOR *src, TENSOR *dst) {
  assert(src->width == dst->width);
  assert(src->height == dst->height);

  const int dst_channels = dst->channels;
  const int channels = dst->channels + src->channels;
  const int newallocsize = channels * dst->width * dst->height;
  if (dst->allocsize < newallocsize) {
    TENSOR t;
    init_tensor(&t);
    // Allocate new buffers and copy the dst channels first.
    if (!realloc_tensor(&t, channels, dst->width, dst->height)) return false;
    copy_tensor(dst, dst->channels, 0, &t);
    // Swap the tensors and free the old buffers.
    swap_tensor(dst, &t);
    free_tensor(&t);
  }
  for (int c = 1; c < channels; ++c)
    dst->buf[c] = &dst->buf[0][c * dst->width * dst->height];
  // Copy the channels in src after the first dst_channels channels.
  copy_tensor(src, src->channels, dst_channels, dst);
  return true;
}

int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) {
  return (t1->width == t2->width && t1->height == t2->height);
}

int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) {
  return (t1->channels == t2->channels && t1->width == t2->width &&
          t1->height == t2->height);
}

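// Computes the spatial output size of a single layer from the input size,
// padding mode, filter size and skip (stride) values, for both convolution
// and deconvolution layers.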
void av1_find_cnn_layer_output_size(int in_width, int in_height,
                                    const CNN_LAYER_CONFIG *layer_config,
                                    int *out_width, int *out_height) {
  assert(layer_config->skip_width > 0);
  assert(layer_config->skip_height > 0);
  if (!layer_config->deconvolve) {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
      case PADDING_SAME_REPLICATE:
        *out_width = (in_width + layer_config->skip_width - 1) /
                     layer_config->skip_width;
        *out_height = (in_height + layer_config->skip_height - 1) /
                      layer_config->skip_height;
        break;
      case PADDING_VALID:
        *out_width =
            (in_width - layer_config->filter_width + layer_config->skip_width) /
            layer_config->skip_width;
        *out_height = (in_height - layer_config->filter_height +
                       layer_config->skip_height) /
                      layer_config->skip_height;
        break;
      default: assert(0 && "Unknown padding type");
    }
  } else {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
      case PADDING_SAME_REPLICATE:
        *out_width = in_width * layer_config->skip_width;
        *out_height = in_height * layer_config->skip_height;
        break;
      case PADDING_VALID:
        *out_width = (in_width - 1) * layer_config->skip_width +
                     layer_config->filter_width;
        *out_height = (in_height - 1) * layer_config->skip_height +
                      layer_config->filter_height;
        break;
      default: assert(0 && "Unknown padding type");
    }
  }
}

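// Computes the number of output channels each branch carries after this
// layer, taking branch copies and branch combining into account.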
void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config,
                           int channels_per_branch[]) {
  int branch = layer_config->branch;
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      if (layer_config->branch_copy_type == BRANCH_INPUT) {
        channels_per_branch[b] = layer_config->in_channels;
      } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
        channels_per_branch[b] = layer_config->out_channels;
      } else if (layer_config->branch_copy_type == BRANCH_COMBINED) {
        channels_per_branch[b] = layer_config->out_channels;
        for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
          if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
            assert(channels_per_branch[c] > 0);
            channels_per_branch[b] += channels_per_branch[c];
          }
        }
      }
    }
  }
  channels_per_branch[branch] = layer_config->out_channels;
  for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
    if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
      assert(channels_per_branch[c] > 0);
      channels_per_branch[branch] += channels_per_branch[c];
    }
  }
}

#if CONFIG_DEBUG
static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) {
  const int num_layers = cnn_config->num_layers;
  const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config;

  for (int idx = 0; idx < num_layers; idx++) {
    if (layer_configs[idx].output_num != -1) {
      return 1;
    }
  }
  return 0;
}
#endif

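// Walks every layer to determine, for each output layer, the final output
// width, height and channel count.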
void av1_find_cnn_output_size(int in_width, int in_height,
                              const CNN_CONFIG *cnn_config, int *out_width,
                              int *out_height, int *out_channels) {
  int channels_per_branch[CNN_MAX_BRANCHES] = { 0 };
  int i_width[CNN_MAX_BRANCHES] = { 0 };
  int i_height[CNN_MAX_BRANCHES] = { 0 };
  i_width[0] = in_width + cnn_config->ext_width * 2;
  i_height[0] = in_height + cnn_config->ext_height * 2;

#if CONFIG_DEBUG
  assert(cnn_has_at_least_one_output(cnn_config));
#endif

  for (int i = 0; i < cnn_config->num_layers; ++i) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i];
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
    const int branch = layer_config->branch;
    int o_width = 0, o_height = 0;

    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          assert(i_width[branch] > 0 && i_height[branch] > 0);
          i_width[b] = i_width[branch];
          i_height[b] = i_height[branch];
        }
      }
    }

    av1_find_cnn_layer_output_size(i_width[branch], i_height[branch],
                                   layer_config, &o_width, &o_height);
    i_width[branch] = o_width;
    i_height[branch] = o_height;

    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          i_width[b] = o_width;
          i_height[b] = o_height;
        }
      }
    }

    find_cnn_out_channels(layer_config, channels_per_branch);

    const int output_num = layer_config->output_num;
    if (output_num != -1) {  // Current layer is an output layer
      out_width[output_num] = o_width;
      out_height[output_num] = o_height;
      out_channels[output_num] = channels_per_branch[layer_config->branch];
    }
  }
}

activation_fn get_activation(ACTIVATION layer_activation) {
  switch (layer_activation) {
    case NONE: return identity;
    case RELU: return relu;
    case SOFTSIGN: return softsign;
    case SIGMOID:
      assert(0 && "Sigmoid is not supported in CNN.");  // TODO
      return NULL;
    default: assert(0 && "Unknown activation type"); return NULL;
  }
}

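// Computes where the first convolution window is placed along a dimension of
// size `width`, given the filter width and the stride, so that the strided
// output samples stay roughly centered.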
static INLINE int get_start_shift_convolve(int width, int filt_width,
                                           int stride) {
  const int mod = (width % stride);
  const int filt_off = (filt_width - 1) / 2;
  const int dif = (mod ? mod - 1 : stride - 1);
  return AOMMIN((dif + (filt_width % 2)) / 2, filt_off);
}

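// Element-wise addition: adds each channel of `add` into `output`.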
void av1_cnn_add_c(float **output, int channels, int width, int height,
                   int stride, const float **add) {
  for (int c = 0; c < channels; ++c) {
    for (int i = 0; i < height; ++i)
      for (int j = 0; j < width; ++j)
        output[c][i * stride + j] += add[c][i * stride + j];
  }
}

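// Applies the layer's activation function in place to every sample of
// `output`.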
void av1_cnn_activate_c(float **output, int channels, int width, int height,
                        int stride, ACTIVATION layer_activation) {
  activation_fn activation = get_activation(layer_activation);
  for (int c = 0; c < channels; ++c) {
    for (int i = 0; i < height; ++i)
      for (int j = 0; j < width; ++j)
        output[c][i * stride + j] = activation(output[c][i * stride + j]);
  }
}

static bool copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
                                           const CNN_LAYER_CONFIG *layer_config,
                                           int branch, TENSOR branch_output[]) {
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      // Copy the layer's active tensor to the output tensor of branch b if
      // branch b is set in the mask. That output then serves as the input to
      // the first layer of branch b, since branch b's first layer is not the
      // first layer of the network.
      int copy_channels = branch_config->channels_to_copy > 0
                              ? branch_config->channels_to_copy
                              : layer_active_tensor->channels;
      if (!realloc_tensor(&branch_output[b], copy_channels,
                          layer_active_tensor->width,
                          layer_active_tensor->height)) {
        return false;
      }
      copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]);
    }
  }
  return true;
}

// CNNConvolve variant for the case where maxpool is 1, either skip_width or
// skip_height is greater than 1, and padding is PADDING_SAME_ZERO.
static void convolve_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l - filter_height_half;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m - filter_width_half;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}

// CNNConvolve variant for the case where maxpool is 1, either skip_width or
// skip_height is greater than 1, and padding is PADDING_SAME_REPLICATE.
static void convolve_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii =
                    CLAMPINDEX(hh + l - filter_height_half, in_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj =
                      CLAMPINDEX(ww + m - filter_width_half, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}

// CNNConvolve variant for the case where maxpool is 1, either skip_width or
// skip_height is greater than 1, and padding is PADDING_VALID.
static void convolve_maxpool_padding_valid(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m;
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}

// CNNConvolve variant for the case where maxpool is 0 and both filter_height
// and filter_width are 1 (a 1x1, element-wise convolution).
static void convolve_element_wise(const float **input, int in_width,
                                  int in_height, int in_stride,
                                  const CNN_LAYER_CONFIG *const layer_config,
                                  float **output, int out_stride, int start_idx,
                                  int step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) +
      start_idx * layer_config->skip_width;
  const int out_w_step = AOMMAX(step, 1);
  const int in_w_step = layer_config->skip_width * out_w_step;
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int in_h = h * in_stride;
      const int out_h = u * out_stride + start_idx;
      for (int w = start_w, out_index = out_h; w < in_width;
           w += in_w_step, out_index += out_w_step) {
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          sum += layer_config->weights[k * layer_config->out_channels + i] *
                 input[k][in_h + w];
        }
        output[i][out_index] = sum;
      }
    }
  }
}

// CNNConvolve variant for the case where maxpool is 0 and padding is
// PADDING_SAME_ZERO.
static void convolve_no_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int filter_width_half,
    const int filter_height_half, const int ii_shift, const int jj_shift,
    const int channel_step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w = get_start_shift_convolve(
      in_width, layer_config->filter_width, layer_config->skip_width);
  const int end_ii_shift = filter_height_half + 1;
  const int end_jj_shift = filter_width_half + 1;
  // *_filter_margin stores the number of pixels along a dimension in the
  // intersection of the complement of the image in the extended image
  // and the filter.
  const int top_filter_margin = layer_config->filter_width * ii_shift;
  const int right_filter_margin = end_jj_shift - in_width;
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int top_cstep =
          AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
              cstep +
          i;
      const int start_ii = AOMMAX(0, h - ii_shift);
      const int end_ii = AOMMIN(in_height, h + end_ii_shift);
      for (int w = start_w, out_index = out_h; w < in_width;
           w += layer_config->skip_width, ++out_index) {
        const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
        const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep;
        const int start_jj = AOMMAX(0, w - jj_shift);
        const int end_jj = AOMMIN(in_width, w + end_jj_shift);
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + top_cstep;
          for (int ii = start_ii; ii < end_ii; ++ii) {
            off += left_cstep;
            for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
            }
            off += right_cstep;
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}

// CNNConvolve variant for the case where maxpool is 0 and padding is
// PADDING_SAME_REPLICATE.
static void convolve_no_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int ii_shift, const int jj_shift,
    const int channel_step) {
  // h and w are shifted to an offset coordinate system to reduce in-loop
  // computation.
  const int start_h =
      get_start_shift_convolve(in_height, layer_config->filter_height,
                               layer_config->skip_height) -
      ii_shift;
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) -
      jj_shift;
  const int end_h = in_height - ii_shift;
  const int end_w = in_width - jj_shift;
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < end_h;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = start_w, out_index = out_h; w < end_w;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            const int clamped_ii = CLAMPINDEX(ii, in_height);
            for (int jj = w; jj < upper_jj_index; ++jj) {
              const int clamped_jj = CLAMPINDEX(jj, in_width);
              assert(clamped_ii >= 0 && clamped_ii < in_height &&
                     clamped_jj >= 0 && clamped_jj < in_width);
              sum += layer_config->weights[off] *
                     input[k][clamped_ii * in_stride + clamped_jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}

// CNNConvolve variant for the case where maxpool is 0 and padding is
// PADDING_VALID.
void av1_cnn_convolve_no_maxpool_padding_valid_c(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
    int start_idx, int cstep, int channel_step) {
  assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
         !layer_config->maxpool);
  assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
  assert(layer_config->pad == PADDING_VALID);
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = 0, out_index = out_h;
           w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            for (int jj = w; jj < upper_jj_index; ++jj) {
              assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}

static void av1_cnn_convolve(const float **input, int in_width, int in_height,
                             int in_stride,
                             const CNN_LAYER_CONFIG *layer_config,
                             float **output, int out_stride, int start_idx,
                             int step) {
  assert(!layer_config->deconvolve);
  const int cstep = layer_config->in_channels * layer_config->out_channels;
  const int filter_height_half = layer_config->filter_height >> 1;
  const int filter_width_half = layer_config->filter_width >> 1;
  const int channel_step = AOMMAX(step, 1);

  if (layer_config->maxpool &&
      (layer_config->skip_height > 1 || layer_config->skip_width > 1)) {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        convolve_maxpool_padding_zero(input, in_width, in_height, in_stride,
                                      layer_config, output, out_stride, cstep,
                                      filter_width_half, filter_height_half);
        break;
      case PADDING_SAME_REPLICATE:
        convolve_maxpool_padding_replicate(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, cstep, filter_width_half, filter_height_half);
        break;
      case PADDING_VALID:
        convolve_maxpool_padding_valid(input, in_width, in_height, in_stride,
                                       layer_config, output, out_stride, cstep);
        break;
      default: assert(0 && "Unknown padding type");
    }
  } else {
    // Results in element-wise matrix multiplication.
    if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
      convolve_element_wise(input, in_width, in_height, in_stride, layer_config,
                            output, out_stride, start_idx, step);
      return;
    }
    const int ii_shift =
        filter_height_half - (layer_config->filter_height - 1) % 2;
    const int jj_shift =
        filter_width_half - (layer_config->filter_width - 1) % 2;
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        convolve_no_maxpool_padding_zero(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, filter_width_half, filter_height_half,
            ii_shift, jj_shift, channel_step);
        break;
      case PADDING_SAME_REPLICATE:
        convolve_no_maxpool_padding_replicate(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, ii_shift, jj_shift, channel_step);
        break;
      case PADDING_VALID:
        av1_cnn_convolve_no_maxpool_padding_valid(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, channel_step);
        break;
      default: assert(0 && "Unknown padding type");
    }
  }
}

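// Worker hook: convolves the portion of the layer described by the
// CONVOLVE_OPS passed in arg1.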
static int convolve_layer(void *arg1, void *arg2) {
  const CONVOLVE_OPS *convolve_ops = arg1;
  (void)arg2;
  av1_cnn_convolve(
      convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height,
      convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output,
      convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step);
  return 1;
}

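// Multi-threaded convolution of one layer: the output is partitioned across
// the workers (worker th starts at index th and advances by num_workers).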
static void convolve_layer_mt(const float **input, int in_width, int in_height,
                              int in_stride,
                              const CNN_LAYER_CONFIG *layer_config,
                              const CNN_THREAD_DATA *thread_data,
                              float **output, int out_stride) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  const int num_workers = thread_data->num_workers;
  assert(thread_data->workers);

  CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    AVxWorker *const worker = &thread_data->workers[th];
    winterface->reset(worker);

    CONVOLVE_OPS convolve_op = { input,      in_width,     in_height,
                                 in_stride,  layer_config, output,
                                 out_stride, th,           num_workers };
    convolve_ops[th] = convolve_op;
    worker->hook = convolve_layer;
    worker->data1 = &(convolve_ops[th]);
    worker->data2 = NULL;

    // Start convolving.
    if (th == num_workers - 1) {
      winterface->execute(worker);
    } else {
      winterface->launch(worker);
    }
  }

  // Wait until all workers have finished.
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    winterface->sync(&thread_data->workers[th]);
  }
}

static INLINE int get_start_shift_deconvolve(int filt_width, int stride) {
  const int dif = AOMMAX(filt_width - stride, 0);
  return dif / 2;
}

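// Batch normalization applied in place, per channel:
// image = gamma * (image - mean) / std + beta.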
void av1_cnn_batchnorm_c(float **image, int channels, int width, int height,
                         int stride, const float *gamma, const float *beta,
                         const float *mean, const float *std) {
  assert(gamma && beta && mean && std && "batchnorm has null parameter!");
  for (int ch = 0; ch < channels; ch++) {
    const float ch_gamma = gamma[ch];
    const float ch_beta = beta[ch];
    const float ch_mean = mean[ch];
    const float ch_std = std[ch];
    float *image_row = image[ch];

    for (int row = 0; row < height; row++) {
      for (int col = 0; col < width; col++) {
        image_row[col] =
            ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta;
      }
      image_row += stride;
    }
  }
}

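// Transposed convolution (deconvolution) of `input` into `output` for all
// supported padding modes.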
void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
                          int in_stride, const CNN_LAYER_CONFIG *layer_config,
                          float **output, int out_stride) {
  assert(layer_config->deconvolve);

  const int cstep = layer_config->in_channels * layer_config->out_channels;

  int out_width = 0;
  int out_height = 0;
  av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width,
                                 &out_height);
  switch (layer_config->pad) {
    case PADDING_SAME_ZERO:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h =
                    u - l +
                    get_start_shift_deconvolve(layer_config->filter_height,
                                               layer_config->skip_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w =
                      v - m +
                      get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_SAME_REPLICATE:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h =
                    u - l +
                    get_start_shift_deconvolve(layer_config->filter_height,
                                               layer_config->skip_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w =
                      v - m +
                      get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii =
                      CLAMPINDEX(h / layer_config->skip_height, in_height);
                  const int jj =
                      CLAMPINDEX(w / layer_config->skip_width, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_VALID:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h = u - l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w = v - m;
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    default: assert(0 && "Unknown padding type");
  }
}

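// Runs the full CNN on `input`: for every layer, sets up the per-branch
// input/output tensors, convolves or deconvolves, handles branch copies and
// combining (add/concatenate), applies the activation and batch norm, and
// writes output layers into output_struct. Returns false on allocation
// failure.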
bool av1_cnn_predict_c(const float **input, int in_width, int in_height,
                       int in_stride, const CNN_CONFIG *cnn_config,
                       const CNN_THREAD_DATA *thread_data,
                       CNN_MULTI_OUT *output_struct) {
  bool success = false;
  TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } };
  TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } };

  float **output[CNN_MAX_BRANCHES];
  const int *out_chs = output_struct->output_channels;
  output[0] = output_struct->output_buffer;
  for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) {
    output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1];
  }

  int i_width = in_width;
  int i_height = in_height;
  int o_width = 0, o_height = 0;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    init_tensor(&tensor1[b]);
    init_tensor(&tensor2[b]);
  }

  const int *out_stride = output_struct->output_strides;
  for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
    const int branch = layer_config->branch;
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;

    // Allocate input tensor
    if (layer == 0) {       // First layer
      assert(branch == 0);  // First layer must be primary branch
      assign_tensor(&tensor1[branch], (float **)input,
                    layer_config->in_channels, in_width, in_height, in_stride);
    } else {  // Non-first layer
      // Swap tensor1 and tensor2
      swap_tensor(&tensor1[branch], &tensor2[branch]);

      i_width = tensor1[branch].width;
      i_height = tensor1[branch].height;
    }

    // Allocate output tensor
    av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width,
                                   &o_height);
    const int output_num = layer_config->output_num;
    if (output_num == -1) {  // Non-output layer
      if (!realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
                          o_height)) {
        goto Error;
      }
    } else {  // Output layer
      free_tensor(&tensor2[branch]);
      assign_tensor(&tensor2[branch], output[output_num],
                    layer_config->out_channels, o_width, o_height,
                    out_stride[output_num]);
    }

    // If we are combining branches make sure that the branch to combine
    // is different from the current branch.
    assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC,
                   !(branch_config->branches_to_combine & (1 << branch))));

    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      if (!copy_active_tensor_to_branches(&tensor1[branch], layer_config,
                                          branch, tensor2)) {
        goto Error;
      }
    }
    // Check consistency of input and output channels
    assert(tensor1[branch].channels == layer_config->in_channels);
    assert(tensor2[branch].channels == layer_config->out_channels);

    // Convolve/Deconvolve
    if (!cnn_config->layer_config[layer].deconvolve) {
      if (thread_data->num_workers > 1) {
        convolve_layer_mt((const float **)tensor1[branch].buf,
                          tensor1[branch].width, tensor1[branch].height,
                          tensor1[branch].stride, layer_config, thread_data,
                          tensor2[branch].buf, tensor2[branch].stride);
      } else {
        av1_cnn_convolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride, 0, 1);
      }
    } else {
      av1_cnn_deconvolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride);
    }

    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
                                          branch, tensor2)) {
        goto Error;
      }
    }

    // Add tensors from other branches if needed
    if (layer_config->branch_combine_type == BRANCH_ADD) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
          assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch]));
          av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels,
                      tensor2[branch].width, tensor2[branch].height,
                      tensor2[branch].stride, (const float **)tensor2[b].buf);
        }
      }
    }

    // Non-linearity
    if (layer_config->activation != IDENTITY)
      av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels,
                       tensor2[branch].width, tensor2[branch].height,
                       tensor2[branch].stride, layer_config->activation);

    if (layer_config->bn_params.bn_gamma) {
      av1_cnn_batchnorm(
          tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width,
          tensor2[branch].height, tensor2[branch].stride,
          layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta,
          layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std);
    }

    // Concatenate tensors
    if (layer_config->branch_combine_type == BRANCH_CAT) {
      if (output_num == -1) {  // Non-output layer
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            assert(tensor2[b].channels > 0);
            if (!concat_tensor(&tensor2[b], &tensor2[branch])) goto Error;
          }
        }
      } else {  // Output layer
        const int existing_channels = tensor2[branch].channels;
        int num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Needed only to assign the new channel buffers
            num_chs += tensor2[b].channels;
          }
        }
        assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width,
                      o_height, out_stride[output_num]);

        num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Copy the channel data from the other branch into the newly
            // assigned buffers.
            copy_tensor(&tensor2[b], tensor2[b].channels, num_chs,
                        &tensor2[branch]);
            num_chs += tensor2[b].channels;
          }
        }
      }
    }

    if (layer_config->branch_copy_type == BRANCH_COMBINED) {
      if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
                                          branch, tensor2)) {
        goto Error;
      }
    }
  }

  success = true;
Error:
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    free_tensor(&tensor1[b]);
    free_tensor(&tensor2[b]);
  }
  return success;
}

// Assumes output already has proper allocation.
// Assumes input image buffers all have the same resolution and strides.
bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
                                   int stride, const CNN_CONFIG *cnn_config,
                                   const CNN_THREAD_DATA *thread_data,
                                   CNN_MULTI_OUT *output) {
  const float max_val = 255.0;

  const int in_width = width + 2 * cnn_config->ext_width;
  const int in_height = height + 2 * cnn_config->ext_height;
  const int in_channels = cnn_config->layer_config[0].in_channels;
  float *inputs[CNN_MAX_CHANNELS];
  float *input_ =
      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
  if (!input_) return false;
  const int in_stride = in_width;

  for (int c = 0; c < in_channels; ++c) {
    inputs[c] = input_ + c * in_stride * in_height;
    float *input =
        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;

    if (cnn_config->strict_bounds) {
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
      // extend left and right
      for (int i = 0; i < height; ++i) {
        for (int j = -cnn_config->ext_width; j < 0; ++j)
          input[i * in_stride + j] = input[i * in_stride];
        for (int j = width; j < width + cnn_config->ext_width; ++j)
          input[i * in_stride + j] = input[i * in_stride + width - 1];
      }
      // extend top and bottom
      for (int i = -cnn_config->ext_height; i < 0; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[-cnn_config->ext_width], in_width * sizeof(*input));
      for (int i = height; i < height + cnn_config->ext_height; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[(height - 1) * in_stride - cnn_config->ext_width],
               in_width * sizeof(*input));
    } else {
      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
           ++i)
        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
             ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
    }
  }
  bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
                                 in_stride, cnn_config, thread_data, output);

  aom_free(input_);
  return success;
}

// Assumes output already has proper allocation.
// Assumes input image buffers all have the same resolution and strides.
bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
                                          int stride,
                                          const CNN_CONFIG *cnn_config,
                                          const CNN_THREAD_DATA *thread_data,
                                          int bit_depth,
                                          CNN_MULTI_OUT *output) {
  const float max_val = (float)((1 << bit_depth) - 1);

  const int in_width = width + 2 * cnn_config->ext_width;
  const int in_height = height + 2 * cnn_config->ext_height;
  const int in_channels = cnn_config->layer_config[0].in_channels;
  float *inputs[CNN_MAX_CHANNELS];
  float *input_ =
      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
  if (!input_) return false;
  const int in_stride = in_width;

  for (int c = 0; c < in_channels; ++c) {
    inputs[c] = input_ + c * in_stride * in_height;
    float *input =
        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;

    if (cnn_config->strict_bounds) {
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
      // extend left and right
      for (int i = 0; i < height; ++i) {
        for (int j = -cnn_config->ext_width; j < 0; ++j)
          input[i * in_stride + j] = input[i * in_stride];
        for (int j = width; j < width + cnn_config->ext_width; ++j)
          input[i * in_stride + j] = input[i * in_stride + width - 1];
      }
      // extend top and bottom
      for (int i = -cnn_config->ext_height; i < 0; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[-cnn_config->ext_width], in_width * sizeof(*input));
      for (int i = height; i < height + cnn_config->ext_height; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[(height - 1) * in_stride - cnn_config->ext_width],
               in_width * sizeof(*input));
    } else {
      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
           ++i)
        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
             ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
    }
  }

  bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
                                 in_stride, cnn_config, thread_data, output);

  aom_free(input_);
  return success;
}

// Assumes output already has proper allocation.
// Assumes input image buffers all have the same resolution and strides.
bool av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
                         const CNN_CONFIG *cnn_config,
                         const CNN_THREAD_DATA *thread_data, float **output,
                         int out_stride) {
  int out_width = 0, out_height = 0, out_channels = 0;
  av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
                           &out_channels);
  const int output_chs[1] = { out_channels };
  const int output_strides[1] = { out_stride };
  CNN_MULTI_OUT output_struct = { .output_channels = output_chs,
                                  .output_strides = output_strides,
                                  .output_buffer = output };
  return av1_cnn_predict_img_multi_out(dgd, width, height, stride, cnn_config,
                                       thread_data, &output_struct);
}

// Assumes output already has proper allocation.
// Assumes input image buffers all have the same resolution and strides.
bool av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
                                int stride, const CNN_CONFIG *cnn_config,
                                const CNN_THREAD_DATA *thread_data,
                                int bit_depth, float **output, int out_stride) {
  int out_width = 0, out_height = 0, out_channels = 0;
  av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
                           &out_channels);
  const int output_chs[1] = { out_channels };
  const int output_strides[1] = { out_stride };
  CNN_MULTI_OUT output_struct = { .output_channels = output_chs,
                                  .output_strides = output_strides,
                                  .output_buffer = output };
  return av1_cnn_predict_img_multi_out_highbd(dgd, width, height, stride,
                                              cnn_config, thread_data,
                                              bit_depth, &output_struct);
}