1 /*
2 * Copyright (c) 2019, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <string.h>

#include "aom_dsp/aom_dsp_common.h"
#include "av1/common/av1_common_int.h"
#include "av1/encoder/cnn.h"
19
20 #define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))
21
// Argument bundle handed to each convolution worker thread (see
// convolve_layer() / convolve_layer_mt()). start_idx and th_step interleave
// the work units (output channels, or columns for 1x1 layers) across the
// worker threads.
typedef struct {
  const float **input;  // Per-channel input plane pointers.
  int in_width;
  int in_height;
  int in_stride;
  const CNN_LAYER_CONFIG *layer_config;  // Layer to run.
  float **output;                        // Per-channel output plane pointers.
  int out_stride;
  int start_idx;  // First work unit this thread handles.
  int th_step;    // Stride between this thread's work units.
} CONVOLVE_OPS;
33
// Pointwise nonlinearity applied to each output sample.
typedef float (*activation_fn)(float);

// Softsign: x / (1 + |x|), a smooth activation bounded in (-1, 1).
// Use the float literal 1.0f so the arithmetic stays in single precision;
// the previous "+ 1.0" promoted to double and then cast back down for the
// same result at extra cost.
static float softsign(float x) { return x / (fabsf(x) + 1.0f); }

// Rectified linear unit: max(0, x).
static float relu(float x) { return (x < 0) ? 0 : x; }

// Identity (no activation).
static float identity(float x) { return x; }
41
// A multi-channel 2D float buffer. When the tensor owns its storage
// (allocsize > 0), buf[0] points at one contiguous allocation of allocsize
// floats and buf[c] for c > 0 are interior pointers into it. When the
// tensor merely aliases external storage (see assign_tensor()), allocsize
// is 0 and nothing is freed on free_tensor().
typedef struct {
  int allocsize;  // Floats allocated at buf[0]; 0 => storage not owned.
  int channels;   // Number of valid plane pointers in buf[].
  int width, height, stride;  // Per-channel plane geometry, in floats.
  float *buf[CNN_MAX_CHANNELS];  // Per-channel plane base pointers.
} TENSOR;
48
// Reset a tensor to the empty, non-owning state (all fields zero/NULL).
static void init_tensor(TENSOR *tensor) { *tensor = (TENSOR){ 0 }; }
50
// Release the backing store of a tensor allocated by realloc_tensor().
// Tensors that only alias external buffers (allocsize == 0) are untouched.
static void free_tensor(TENSOR *tensor) {
  if (tensor->allocsize == 0) return;
  aom_free(tensor->buf[0]);
  tensor->buf[0] = NULL;
  tensor->allocsize = 0;
}
58
// Ensure `tensor` owns at least channels * width * height floats, growing
// the allocation only when needed, then (re)derive geometry and the
// per-channel plane pointers. Existing contents are NOT preserved when a
// reallocation occurs. Returns false on allocation failure (the tensor is
// left freed/empty in that case).
static bool realloc_tensor(TENSOR *tensor, int channels, int width,
                           int height) {
  // buf[] only has CNN_MAX_CHANNELS slots; catch misconfigured layers.
  assert(channels <= CNN_MAX_CHANNELS);
  const int newallocsize = channels * width * height;
  if (tensor->allocsize < newallocsize) {
    free_tensor(tensor);
    tensor->buf[0] =
        (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize);
    if (!tensor->buf[0]) return false;
    tensor->allocsize = newallocsize;
  }
  tensor->width = width;
  tensor->height = height;
  tensor->stride = width;
  tensor->channels = channels;
  // Channel planes are laid out back to back in the single allocation.
  for (int c = 1; c < channels; ++c)
    tensor->buf[c] = &tensor->buf[0][c * width * height];
  return true;
}
77
// Copy the first `copy_channels` channel planes of src into dst, starting
// at dst channel index `dst_offset`. Planes must have identical width and
// height; strides may differ (a row-by-row copy is used in that case).
static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset,
                        TENSOR *dst) {
  assert(src->width == dst->width);
  assert(src->height == dst->height);
  assert(copy_channels <= src->channels);
  if (src->stride == dst->width && dst->stride == dst->width) {
    // Both layouts are contiguous: copy each whole plane with one memcpy.
    for (int c = 0; c < copy_channels; ++c) {
      memcpy(dst->buf[dst_offset + c], src->buf[c],
             sizeof(*dst->buf[0]) * src->width * src->height);
    }
  } else {
    // Strided layout on either side: copy row by row.
    for (int c = 0; c < copy_channels; ++c) {
      for (int r = 0; r < dst->height; ++r) {
        memcpy(&dst->buf[dst_offset + c][r * dst->stride],
               &src->buf[c][r * src->stride],
               dst->width * sizeof(*dst->buf[c]));
      }
    }
  }
}
98
// Point `tensor` at externally-owned plane buffers (or clear its plane
// pointers when buf is NULL). The tensor does not take ownership:
// allocsize is set to 0 so free_tensor() will not free these buffers.
static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS],
                          int channels, int width, int height, int stride) {
  tensor->allocsize = 0;
  tensor->channels = channels;
  tensor->width = width;
  tensor->height = height;
  tensor->stride = stride;
  for (int c = 0; c < channels; ++c) {
    tensor->buf[c] = buf ? buf[c] : NULL;
  }
}
112
// Exchange the full contents (ownership included) of two tensors.
static void swap_tensor(TENSOR *t1, TENSOR *t2) {
  const TENSOR tmp = *t2;
  *t2 = *t1;
  *t1 = tmp;
}
118
119 // The concatenated tensor goes into dst with first the channels in
120 // original dst followed by the channels in the src
concat_tensor(const TENSOR * src,TENSOR * dst)121 static bool concat_tensor(const TENSOR *src, TENSOR *dst) {
122 assert(src->width == dst->width);
123 assert(src->height == dst->height);
124
125 const int dst_channels = dst->channels;
126 const int channels = dst->channels + src->channels;
127 const int newallocsize = channels * dst->width * dst->height;
128 if (dst->allocsize < newallocsize) {
129 TENSOR t;
130 init_tensor(&t);
131 // allocate new buffers and copy first the dst channels
132 if (!realloc_tensor(&t, channels, dst->width, dst->height)) return false;
133 copy_tensor(dst, dst->channels, 0, &t);
134 // Swap the tensors and free the old buffers
135 swap_tensor(dst, &t);
136 free_tensor(&t);
137 }
138 for (int c = 1; c < channels; ++c)
139 dst->buf[c] = &dst->buf[0][c * dst->width * dst->height];
140 // Copy the channels in src after the first dst_channels channels.
141 copy_tensor(src, src->channels, dst_channels, dst);
142 return true;
143 }
144
// Returns nonzero iff both tensors have the same width and height
// (channel counts are not compared).
int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) {
  if (t1->width != t2->width) return 0;
  return t1->height == t2->height;
}
148
// Returns nonzero iff both tensors have identical channels, width and
// height.
int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) {
  if (t1->channels != t2->channels) return 0;
  if (t1->width != t2->width) return 0;
  return t1->height == t2->height;
}
153
av1_find_cnn_layer_output_size(int in_width,int in_height,const CNN_LAYER_CONFIG * layer_config,int * out_width,int * out_height)154 void av1_find_cnn_layer_output_size(int in_width, int in_height,
155 const CNN_LAYER_CONFIG *layer_config,
156 int *out_width, int *out_height) {
157 assert(layer_config->skip_width > 0);
158 assert(layer_config->skip_height > 0);
159 if (!layer_config->deconvolve) {
160 switch (layer_config->pad) {
161 case PADDING_SAME_ZERO:
162 case PADDING_SAME_REPLICATE:
163 *out_width = (in_width + layer_config->skip_width - 1) /
164 layer_config->skip_width;
165 *out_height = (in_height + layer_config->skip_height - 1) /
166 layer_config->skip_height;
167 break;
168 case PADDING_VALID:
169 *out_width =
170 (in_width - layer_config->filter_width + layer_config->skip_width) /
171 layer_config->skip_width;
172 *out_height = (in_height - layer_config->filter_height +
173 layer_config->skip_height) /
174 layer_config->skip_height;
175 break;
176 default: assert(0 && "Unknown padding type");
177 }
178 } else {
179 switch (layer_config->pad) {
180 case PADDING_SAME_ZERO:
181 case PADDING_SAME_REPLICATE:
182 *out_width = in_width * layer_config->skip_width;
183 *out_height = in_height * layer_config->skip_height;
184 break;
185 case PADDING_VALID:
186 *out_width = (in_width - 1) * layer_config->skip_width +
187 layer_config->filter_width;
188 *out_height = (in_height - 1) * layer_config->skip_height +
189 layer_config->filter_height;
190 break;
191 default: assert(0 && "Unknown padding type");
192 }
193 }
194 }
195
// Updates channels_per_branch[] with the channel counts produced by running
// `layer_config`: branches that receive a copy of this layer's data (via
// input_to_branches) and the layer's own branch (including any channels
// gathered from other branches via branches_to_combine).
void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config,
                           int channels_per_branch[]) {
  int branch = layer_config->branch;
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      // Branch b receives a copy; its channel count depends on where in the
      // layer's pipeline the copy is taken from.
      if (layer_config->branch_copy_type == BRANCH_INPUT) {
        channels_per_branch[b] = layer_config->in_channels;
      } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
        channels_per_branch[b] = layer_config->out_channels;
      } else if (layer_config->branch_copy_type == BRANCH_COMBINED) {
        // Copy taken after combining: output channels plus every combined
        // branch's channels.
        channels_per_branch[b] = layer_config->out_channels;
        for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
          if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
            assert(channels_per_branch[c] > 0);
            channels_per_branch[b] += channels_per_branch[c];
          }
        }
      }
    }
  }
  // The layer's own branch ends with out_channels plus any channels
  // combined in from other branches.
  channels_per_branch[branch] = layer_config->out_channels;
  for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
    if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
      assert(channels_per_branch[c] > 0);
      channels_per_branch[branch] += channels_per_branch[c];
    }
  }
}
225
226 #if CONFIG_DEBUG
cnn_has_at_least_one_output(const CNN_CONFIG * cnn_config)227 static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) {
228 const int num_layers = cnn_config->num_layers;
229 const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config;
230
231 for (int idx = 0; idx < num_layers; idx++) {
232 if (layer_configs[idx].output_num != -1) {
233 return 1;
234 }
235 }
236 return 0;
237 }
238 #endif
239
// Computes the width/height/channels of every output layer of the CNN for
// an input of in_width x in_height. out_width/out_height/out_channels are
// arrays indexed by each layer's output_num.
void av1_find_cnn_output_size(int in_width, int in_height,
                              const CNN_CONFIG *cnn_config, int *out_width,
                              int *out_height, int *out_channels) {
  int channels_per_branch[CNN_MAX_BRANCHES] = { 0 };
  int i_width[CNN_MAX_BRANCHES] = { 0 };
  int i_height[CNN_MAX_BRANCHES] = { 0 };
  // Branch 0 starts with the (optionally border-extended) input size.
  i_width[0] = in_width + cnn_config->ext_width * 2;
  i_height[0] = in_height + cnn_config->ext_height * 2;

#if CONFIG_DEBUG
  assert(cnn_has_at_least_one_output(cnn_config));
#endif

  for (int i = 0; i < cnn_config->num_layers; ++i) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i];
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
    const int branch = layer_config->branch;
    int o_width = 0, o_height = 0;

    // A BRANCH_INPUT copy propagates this layer's *input* size to the
    // target branches before the layer runs.
    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          assert(i_width[branch] > 0 && i_height[branch] > 0);
          i_width[b] = i_width[branch];
          i_height[b] = i_height[branch];
        }
      }
    }

    av1_find_cnn_layer_output_size(i_width[branch], i_height[branch],
                                   layer_config, &o_width, &o_height);
    i_width[branch] = o_width;
    i_height[branch] = o_height;

    // A BRANCH_OUTPUT copy propagates this layer's *output* size instead.
    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          i_width[b] = o_width;
          i_height[b] = o_height;
        }
      }
    }

    find_cnn_out_channels(layer_config, channels_per_branch);

    const int output_num = layer_config->output_num;
    if (output_num != -1) {  // Current layer is an output layer
      out_width[output_num] = o_width;
      out_height[output_num] = o_height;
      out_channels[output_num] = channels_per_branch[layer_config->branch];
    }
  }
}
293
get_activation(ACTIVATION layer_activation)294 activation_fn get_activation(ACTIVATION layer_activation) {
295 switch (layer_activation) {
296 case NONE: return identity;
297 case RELU: return relu;
298 case SOFTSIGN: return softsign;
299 case SIGMOID:
300 assert(0 && "Sigmoid has not been supported in CNN."); // TO DO
301 return NULL;
302 default: assert(0 && "Unknown activation type"); return NULL;
303 }
304 }
305
// Returns the coordinate of the first sampled position along one dimension
// for a strided convolution, chosen so the sampled grid sits roughly
// centered within the image. The result is capped at (filt_width - 1) / 2
// so the anchor never moves past the filter's center tap.
// NOTE(review): this formula interacts with the start/end offsets used in
// the convolve kernels below — confirm against those before changing.
static INLINE int get_start_shift_convolve(int width, int filt_width,
                                           int stride) {
  const int mod = (width % stride);
  const int filt_off = (filt_width - 1) / 2;
  const int dif = (mod ? mod - 1 : stride - 1);
  return AOMMIN((dif + (filt_width % 2)) / 2, filt_off);
}
313
// Element-wise accumulate: output[c] += add[c] over every channel plane.
// Both sides share the same geometry (width x height, row pitch `stride`).
void av1_cnn_add_c(float **output, int channels, int width, int height,
                   int stride, const float **add) {
  for (int c = 0; c < channels; ++c) {
    float *dst = output[c];
    const float *src = add[c];
    for (int i = 0; i < height; ++i) {
      const int row = i * stride;
      for (int j = 0; j < width; ++j) {
        dst[row + j] += src[row + j];
      }
    }
  }
}
322
// Applies the layer's activation function in place to every sample of
// every channel plane of `output`.
void av1_cnn_activate_c(float **output, int channels, int width, int height,
                        int stride, ACTIVATION layer_activation) {
  const activation_fn fn = get_activation(layer_activation);
  for (int c = 0; c < channels; ++c) {
    float *plane = output[c];
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) {
        plane[i * stride + j] = fn(plane[i * stride + j]);
      }
    }
  }
}
332
// Copies the layer's active tensor into the output tensor of every branch
// selected by input_to_branches (excluding the layer's own branch). Each
// destination tensor is (re)allocated to fit. Returns false on allocation
// failure.
static bool copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
                                           const CNN_LAYER_CONFIG *layer_config,
                                           int branch, TENSOR branch_output[]) {
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      // Copy layer's active tensor to output tensor of branch b if set in
      // mask. The output becomes the input of the first layer of the branch
      // because the layer of the branch is not the first layer.
      // channels_to_copy == 0 (or negative) means "copy all channels".
      int copy_channels = branch_config->channels_to_copy > 0
                              ? branch_config->channels_to_copy
                              : layer_active_tensor->channels;
      if (!realloc_tensor(&branch_output[b], copy_channels,
                          layer_active_tensor->width,
                          layer_active_tensor->height)) {
        return false;
      }
      copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]);
    }
  }
  return true;
}
355
// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
// greater than 1 and padding equal to PADDING_SAME_ZERO.
// For each output position (u, v), a full convolution is evaluated at every
// input position (hh, ww) inside the skip window and the maximum result is
// kept. Filter taps falling outside the image contribute zero.
static void convolve_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              // Weights for consecutive taps of output channel i are
              // cstep floats apart.
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l - filter_height_half;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m - filter_width_half;
                  // Zero padding: out-of-image taps add nothing.
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            // First sample of the window initializes; later ones maxpool.
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}
397
// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
// greater than 1 and padding equal to PADDING_SAME_REPLICATE.
// Same maxpool-over-skip-window scheme as the zero-padding variant, but
// out-of-image taps are clamped to the nearest edge sample instead of
// contributing zero.
static void convolve_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                // Replicate padding: clamp the tap row into the image.
                const int ii =
                    CLAMPINDEX(hh + l - filter_height_half, in_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj =
                      CLAMPINDEX(ww + m - filter_width_half, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            // First sample of the window initializes; later ones maxpool.
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}
440
// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
// greater than 1 and padding equal to PADDING_VALID.
// The anchor positions are restricted so that the whole filter always fits
// inside the image, hence no padding handling is needed in the inner loops.
static void convolve_maxpool_padding_valid(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m;
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            // First sample of the window initializes; later ones maxpool.
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}
482
// CNNConvolve specific to maxpool set as 0 with filter_height and filter_width
// equal to 1.
// A 1x1 convolution reduces to a per-pixel matrix multiply over channels.
// Work is split across threads by output *column*: thread `start_idx` of
// `step` handles columns start_idx, start_idx + step, ...
static void convolve_element_wise(const float **input, int in_width,
                                  int in_height, int in_stride,
                                  const CNN_LAYER_CONFIG *const layer_config,
                                  float **output, int out_stride, int start_idx,
                                  int step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) +
      start_idx * layer_config->skip_width;
  const int out_w_step = AOMMAX(step, 1);
  const int in_w_step = layer_config->skip_width * out_w_step;
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int in_h = h * in_stride;
      const int out_h = u * out_stride + start_idx;
      for (int w = start_w, out_index = out_h; w < in_width;
           w += in_w_step, out_index += out_w_step) {
        float sum = layer_config->bias[i];
        // Dot product of the input channel vector with column i of the
        // 1x1 weight matrix.
        for (int k = 0; k < layer_config->in_channels; ++k) {
          sum += layer_config->weights[k * layer_config->out_channels + i] *
                 input[k][in_h + w];
        }
        output[i][out_index] = sum;
      }
    }
  }
}
515
// CNNConvolve specific to maxpool set as 0 and padding equal to
// PADDING_SAME_ZERO.
// Zero padding is implemented without branching in the inner loop: the
// start/end bounds skip out-of-image taps, and the weight offset `off` is
// pre-advanced by top/left/right margin counts so it stays in sync with
// the taps actually evaluated. Output channels are interleaved across
// threads via start_idx/channel_step.
static void convolve_no_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int filter_width_half,
    const int filter_height_half, const int ii_shift, const int jj_shift,
    const int channel_step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w = get_start_shift_convolve(
      in_width, layer_config->filter_width, layer_config->skip_width);
  const int end_ii_shift = filter_height_half + 1;
  const int end_jj_shift = filter_width_half + 1;
  // *_filter_margin stores the number of pixels along a dimension in the
  // intersection of the complement of the image in the extended image
  // and the filter.
  const int top_filter_margin = layer_config->filter_width * ii_shift;
  const int right_filter_margin = end_jj_shift - in_width;
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      // Weight offset accounting for filter rows that fall above the image.
      const int top_cstep =
          AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
              cstep +
          i;
      const int start_ii = AOMMAX(0, h - ii_shift);
      const int end_ii = AOMMIN(in_height, h + end_ii_shift);
      for (int w = start_w, out_index = out_h; w < in_width;
           w += layer_config->skip_width, ++out_index) {
        // Weight offsets for filter columns left/right of the image.
        const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
        const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep;
        const int start_jj = AOMMAX(0, w - jj_shift);
        const int end_jj = AOMMIN(in_width, w + end_jj_shift);
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + top_cstep;
          for (int ii = start_ii; ii < end_ii; ++ii) {
            off += left_cstep;
            for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
            }
            off += right_cstep;
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}
567
// CNNConvolve specific to maxpool set as 0 and padding equal to
// PADDING_SAME_REPLICATE.
// Out-of-image taps are clamped to the nearest edge sample via CLAMPINDEX.
// Output channels are interleaved across threads via start_idx/channel_step.
static void convolve_no_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int ii_shift, const int jj_shift,
    const int channel_step) {
  // h and w are shifted to an offset coordinate system to reduce in-loop
  // computation.
  const int start_h =
      get_start_shift_convolve(in_height, layer_config->filter_height,
                               layer_config->skip_height) -
      ii_shift;
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) -
      jj_shift;
  const int end_h = in_height - ii_shift;
  const int end_w = in_width - jj_shift;
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < end_h;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = start_w, out_index = out_h; w < end_w;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            // Replicate padding: clamp the tap coordinates into the image.
            const int clamped_ii = CLAMPINDEX(ii, in_height);
            for (int jj = w; jj < upper_jj_index; ++jj) {
              const int clamped_jj = CLAMPINDEX(jj, in_width);
              assert(clamped_ii >= 0 && clamped_ii < in_height &&
                     clamped_jj >= 0 && clamped_jj < in_width);
              sum += layer_config->weights[off] *
                     input[k][clamped_ii * in_stride + clamped_jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}
615
// CNNConvolve specific to maxpool set as 0 and padding equal to
// PADDING_VALID.
// The anchor positions are restricted so the filter always lies fully
// inside the image. Output channels are interleaved across threads via
// start_idx/channel_step.
void av1_cnn_convolve_no_maxpool_padding_valid_c(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
    int start_idx, int cstep, int channel_step) {
  assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
         !layer_config->maxpool);
  assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
  assert(layer_config->pad == PADDING_VALID);
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = 0, out_index = out_h;
           w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          // Weights for consecutive taps of output channel i are cstep
          // floats apart.
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            for (int jj = w; jj < upper_jj_index; ++jj) {
              assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}
651
av1_cnn_convolve(const float ** input,int in_width,int in_height,int in_stride,const CNN_LAYER_CONFIG * layer_config,float ** output,int out_stride,int start_idx,int step)652 static void av1_cnn_convolve(const float **input, int in_width, int in_height,
653 int in_stride,
654 const CNN_LAYER_CONFIG *layer_config,
655 float **output, int out_stride, int start_idx,
656 int step) {
657 assert(!layer_config->deconvolve);
658 const int cstep = layer_config->in_channels * layer_config->out_channels;
659 const int filter_height_half = layer_config->filter_height >> 1;
660 const int filter_width_half = layer_config->filter_width >> 1;
661 const int channel_step = AOMMAX(step, 1);
662
663 if (layer_config->maxpool &&
664 (layer_config->skip_height > 1 || layer_config->skip_width > 1)) {
665 switch (layer_config->pad) {
666 case PADDING_SAME_ZERO:
667 convolve_maxpool_padding_zero(input, in_width, in_height, in_stride,
668 layer_config, output, out_stride, cstep,
669 filter_width_half, filter_height_half);
670 break;
671 case PADDING_SAME_REPLICATE:
672 convolve_maxpool_padding_replicate(
673 input, in_width, in_height, in_stride, layer_config, output,
674 out_stride, cstep, filter_width_half, filter_height_half);
675 break;
676 case PADDING_VALID:
677 convolve_maxpool_padding_valid(input, in_width, in_height, in_stride,
678 layer_config, output, out_stride, cstep);
679 break;
680 default: assert(0 && "Unknown padding type");
681 }
682 } else {
683 // Results in element-wise matrix multiplication.
684 if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
685 convolve_element_wise(input, in_width, in_height, in_stride, layer_config,
686 output, out_stride, start_idx, step);
687 return;
688 }
689 const int ii_shift =
690 filter_height_half - (layer_config->filter_height - 1) % 2;
691 const int jj_shift =
692 filter_width_half - (layer_config->filter_width - 1) % 2;
693 switch (layer_config->pad) {
694 case PADDING_SAME_ZERO:
695 convolve_no_maxpool_padding_zero(
696 input, in_width, in_height, in_stride, layer_config, output,
697 out_stride, start_idx, cstep, filter_width_half, filter_height_half,
698 ii_shift, jj_shift, channel_step);
699 break;
700 case PADDING_SAME_REPLICATE:
701 convolve_no_maxpool_padding_replicate(
702 input, in_width, in_height, in_stride, layer_config, output,
703 out_stride, start_idx, cstep, ii_shift, jj_shift, channel_step);
704 break;
705 case PADDING_VALID:
706 av1_cnn_convolve_no_maxpool_padding_valid(
707 input, in_width, in_height, in_stride, layer_config, output,
708 out_stride, start_idx, cstep, channel_step);
709 break;
710 default: assert(0 && "Unknown padding type");
711 }
712 }
713 }
714
convolve_layer(void * arg1,void * arg2)715 static int convolve_layer(void *arg1, void *arg2) {
716 const CONVOLVE_OPS *convolve_ops = arg1;
717 (void)arg2;
718 av1_cnn_convolve(
719 convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height,
720 convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output,
721 convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step);
722 return 1;
723 }
724
// Runs one convolution layer split across the workers in thread_data:
// worker th handles work units th, th + num_workers, th + 2*num_workers, ...
// (output channels, or columns for 1x1 layers; see av1_cnn_convolve()).
static void convolve_layer_mt(const float **input, int in_width, int in_height,
                              int in_stride,
                              const CNN_LAYER_CONFIG *layer_config,
                              const CNN_THREAD_DATA *thread_data,
                              float **output, int out_stride) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  const int num_workers = thread_data->num_workers;
  assert(thread_data->workers);

  // convolve_ops must outlive the workers, which hold pointers into it.
  CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    AVxWorker *const worker = &thread_data->workers[th];
    winterface->reset(worker);

    CONVOLVE_OPS convolve_op = { input,      in_width,     in_height,
                                 in_stride,  layer_config, output,
                                 out_stride, th,           num_workers };
    convolve_ops[th] = convolve_op;
    worker->hook = convolve_layer;
    worker->data1 = &(convolve_ops[th]);
    worker->data2 = NULL;

    // Start convolving.
    if (th == num_workers - 1) {
      // The last job runs synchronously on the current thread.
      winterface->execute(worker);
    } else {
      winterface->launch(worker);
    }
  }

  // Wait until all workers have finished.
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    winterface->sync(&thread_data->workers[th]);
  }
}
760
get_start_shift_deconvolve(int filt_width,int stride)761 static INLINE int get_start_shift_deconvolve(int filt_width, int stride) {
762 const int dif = AOMMAX(filt_width - stride, 0);
763 return dif / 2;
764 }
765
// Applies per-channel batch normalization in place:
//   x' = gamma[ch] * (x - mean[ch]) / std[ch] + beta[ch]
// image holds `channels` plane pointers, each plane being `height` rows of
// `width` samples with row pitch `stride`. All four parameter arrays hold
// one value per channel and must be non-NULL.
void av1_cnn_batchnorm_c(float **image, int channels, int width, int height,
                         int stride, const float *gamma, const float *beta,
                         const float *mean, const float *std) {
  // Bug fix: the original assert tested `beta` twice and never checked
  // `mean`.
  assert(gamma && beta && mean && std && "batchnorm has null parameter!");
  for (int ch = 0; ch < channels; ch++) {
    const float ch_gamma = gamma[ch];
    const float ch_beta = beta[ch];
    const float ch_mean = mean[ch];
    const float ch_std = std[ch];
    float *image_row = image[ch];

    for (int row = 0; row < height; row++) {
      for (int col = 0; col < width; col++) {
        image_row[col] =
            ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta;
      }
      // Advance to the next row of this channel's plane.
      image_row += stride;
    }
  }
}
786
// Deconvolution (transposed convolution), C reference implementation.
// For every output pixel (u, v) of every output channel i, gathers the
// contributions of the input pixels whose upsampled footprint covers (u, v):
// a filter tap (l, m) maps the output coordinate back to a candidate input
// position; only positions that land exactly on a multiple of the upsampling
// stride (skip_height/skip_width) contribute. The three switch cases differ
// only in how out-of-range input coordinates are handled.
void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
                          int in_stride, const CNN_LAYER_CONFIG *layer_config,
                          float **output, int out_stride) {
  assert(layer_config->deconvolve);

  // Weights are interleaved by channel pair: tap (l, m) for input channel k
  // and output channel i lives at k * out_channels + i plus a multiple of
  // cstep, so `off` below advances by cstep per filter tap.
  const int cstep = layer_config->in_channels * layer_config->out_channels;

  int out_width = 0;
  int out_height = 0;
  av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width,
                                 &out_height);
  switch (layer_config->pad) {
    case PADDING_SAME_ZERO:
      // Out-of-range input samples are treated as zero (skipped).
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                // Candidate (pre-upsampling) vertical position for this tap,
                // shifted so the filter is centered over the output.
                const int h =
                    u - l +
                    get_start_shift_deconvolve(layer_config->filter_height,
                                               layer_config->skip_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w =
                      v - m +
                      get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);
                  // Only positions on the upsampling grid correspond to a
                  // real input sample.
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  // Zero padding: skip samples outside the input plane.
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_SAME_REPLICATE:
      // Out-of-range input coordinates are clamped to the nearest edge
      // sample (border replication).
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h =
                    u - l +
                    get_start_shift_deconvolve(layer_config->filter_height,
                                               layer_config->skip_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w =
                      v - m +
                      get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii =
                      CLAMPINDEX(h / layer_config->skip_height, in_height);
                  const int jj =
                      CLAMPINDEX(w / layer_config->skip_width, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_VALID:
      // No centering shift: only fully-covered positions are computed, and
      // any remaining out-of-range taps are skipped.
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h = u - l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w = v - m;
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    default: assert(0 && "Unknown padding type");
  }
}
902
// Runs the full CNN described by `cnn_config` on the input planes.
// Layers are processed in order. For each branch, tensor1[] holds the
// current layer's input and tensor2[] its output; the two are swapped at the
// start of every non-first layer. Layers with output_num != -1 write
// directly into the caller's buffers in `output_struct` instead of an
// internally allocated tensor. Returns true on success, false if any
// internal tensor allocation fails (all internal tensors are freed either
// way).
bool av1_cnn_predict_c(const float **input, int in_width, int in_height,
                       int in_stride, const CNN_CONFIG *cnn_config,
                       const CNN_THREAD_DATA *thread_data,
                       CNN_MULTI_OUT *output_struct) {
  bool success = false;
  TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } };
  TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } };

  // Per-output channel-pointer arrays: output[k] starts right after the
  // channel buffers of output k-1 inside output_struct->output_buffer.
  float **output[CNN_MAX_BRANCHES];
  const int *out_chs = output_struct->output_channels;
  output[0] = output_struct->output_buffer;
  for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) {
    output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1];
  }

  int i_width = in_width;
  int i_height = in_height;
  int o_width = 0, o_height = 0;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    init_tensor(&tensor1[b]);
    init_tensor(&tensor2[b]);
  }

  const int *out_stride = output_struct->output_strides;
  for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
    const int branch = layer_config->branch;
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;

    // Allocate input tensor
    if (layer == 0) {       // First layer
      assert(branch == 0);  // First layer must be primary branch
      // The first layer reads the caller's input planes directly (no copy).
      assign_tensor(&tensor1[branch], (float **)input,
                    layer_config->in_channels, in_width, in_height, in_stride);
    } else {  // Non-first layer
      // Swap tensor1 and tensor2: last layer's output becomes this input.
      swap_tensor(&tensor1[branch], &tensor2[branch]);

      i_width = tensor1[branch].width;
      i_height = tensor1[branch].height;
    }

    // Allocate output tensor
    av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width,
                                   &o_height);
    const int output_num = layer_config->output_num;
    if (output_num == -1) {  // Non-output layer
      if (!realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
                          o_height)) {
        goto Error;
      }
    } else {  // Output layer: point tensor2 at the caller-provided buffers.
      free_tensor(&tensor2[branch]);
      assign_tensor(&tensor2[branch], output[output_num],
                    layer_config->out_channels, o_width, o_height,
                    out_stride[output_num]);
    }

    // If we are combining branches make sure that the branch to combine
    // is different from the current branch.
    assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC,
                   !(branch_config->branches_to_combine & (1 << branch))));

    // Optionally fan the layer's *input* out to other branches.
    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      if (!copy_active_tensor_to_branches(&tensor1[branch], layer_config,
                                          branch, tensor2)) {
        goto Error;
      }
    }
    // Check consistency of input and output channels
    assert(tensor1[branch].channels == layer_config->in_channels);
    assert(tensor2[branch].channels == layer_config->out_channels);

    // Convolve/Deconvolve
    if (!cnn_config->layer_config[layer].deconvolve) {
      if (thread_data->num_workers > 1) {
        convolve_layer_mt((const float **)tensor1[branch].buf,
                          tensor1[branch].width, tensor1[branch].height,
                          tensor1[branch].stride, layer_config, thread_data,
                          tensor2[branch].buf, tensor2[branch].stride);
      } else {
        av1_cnn_convolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride, 0, 1);
      }
    } else {
      av1_cnn_deconvolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride);
    }

    // Optionally fan the raw convolution *output* out to other branches
    // (before activation/batchnorm/combine).
    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
                                          branch, tensor2)) {
        goto Error;
      }
    }

    // Add tensors from other branches if needed
    if (layer_config->branch_combine_type == BRANCH_ADD) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
          assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch]));
          av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels,
                      tensor2[branch].width, tensor2[branch].height,
                      tensor2[branch].stride, (const float **)tensor2[b].buf);
        }
      }
    }

    // Non-linearity
    if (layer_config->activation != IDENTITY)
      av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels,
                       tensor2[branch].width, tensor2[branch].height,
                       tensor2[branch].stride, layer_config->activation);

    // Batch normalization, only when the layer provides parameters.
    if (layer_config->bn_params.bn_gamma) {
      av1_cnn_batchnorm(
          tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width,
          tensor2[branch].height, tensor2[branch].stride,
          layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta,
          layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std);
    }

    // Concatenate tensors
    if (layer_config->branch_combine_type == BRANCH_CAT) {
      if (output_num == -1) {  // Non-output layer
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            assert(tensor2[b].channels > 0);
            if (!concat_tensor(&tensor2[b], &tensor2[branch])) goto Error;
          }
        }
      } else {  // Output layer
        const int existing_channels = tensor2[branch].channels;
        int num_chs = existing_channels;
        // First pass: count total channels after concatenation so the
        // output tensor's channel pointers can be assigned.
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Needed only to assign the new channel buffers
            num_chs += tensor2[b].channels;
          }
        }
        assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width,
                      o_height, out_stride[output_num]);

        num_chs = existing_channels;
        // Second pass: copy each combined branch's data into its channel
        // slot within the concatenated output.
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            copy_tensor(&tensor2[b], tensor2[b].channels, num_chs,
                        &tensor2[branch]);
            num_chs += tensor2[b].channels;
          }
        }
      }
    }

    // Optionally fan the fully-combined result out to other branches.
    if (layer_config->branch_copy_type == BRANCH_COMBINED) {
      if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
                                          branch, tensor2)) {
        goto Error;
      }
    }
  }

  success = true;
Error:
  // Free all internally allocated tensors; tensors that merely alias the
  // caller's input/output buffers have allocsize == 0 and are not freed.
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    free_tensor(&tensor1[b]);
    free_tensor(&tensor2[b]);
  }
  return success;
}
1081
1082 // Assume output already has proper allocation
1083 // Assume input image buffers all have same resolution and strides
av1_cnn_predict_img_multi_out(uint8_t ** dgd,int width,int height,int stride,const CNN_CONFIG * cnn_config,const CNN_THREAD_DATA * thread_data,CNN_MULTI_OUT * output)1084 bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
1085 int stride, const CNN_CONFIG *cnn_config,
1086 const CNN_THREAD_DATA *thread_data,
1087 CNN_MULTI_OUT *output) {
1088 const float max_val = 255.0;
1089
1090 const int in_width = width + 2 * cnn_config->ext_width;
1091 const int in_height = height + 2 * cnn_config->ext_height;
1092 const int in_channels = cnn_config->layer_config[0].in_channels;
1093 float *inputs[CNN_MAX_CHANNELS];
1094 float *input_ =
1095 (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
1096 if (!input_) return false;
1097 const int in_stride = in_width;
1098
1099 for (int c = 0; c < in_channels; ++c) {
1100 inputs[c] = input_ + c * in_stride * in_height;
1101 float *input =
1102 inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;
1103
1104 if (cnn_config->strict_bounds) {
1105 for (int i = 0; i < height; ++i)
1106 for (int j = 0; j < width; ++j)
1107 input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
1108 // extend left and right
1109 for (int i = 0; i < height; ++i) {
1110 for (int j = -cnn_config->ext_width; j < 0; ++j)
1111 input[i * in_stride + j] = input[i * in_stride];
1112 for (int j = width; j < width + cnn_config->ext_width; ++j)
1113 input[i * in_stride + j] = input[i * in_stride + width - 1];
1114 }
1115 // extend top and bottom
1116 for (int i = -cnn_config->ext_height; i < 0; ++i)
1117 memcpy(&input[i * in_stride - cnn_config->ext_width],
1118 &input[-cnn_config->ext_width], in_width * sizeof(*input));
1119 for (int i = height; i < height + cnn_config->ext_height; ++i)
1120 memcpy(&input[i * in_stride - cnn_config->ext_width],
1121 &input[(height - 1) * in_stride - cnn_config->ext_width],
1122 in_width * sizeof(*input));
1123 } else {
1124 for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
1125 ++i)
1126 for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
1127 ++j)
1128 input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
1129 }
1130 }
1131 bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
1132 in_stride, cnn_config, thread_data, output);
1133
1134 aom_free(input_);
1135 return success;
1136 }
1137
1138 // Assume output already has proper allocation
1139 // Assume input image buffers all have same resolution and strides
av1_cnn_predict_img_multi_out_highbd(uint16_t ** dgd,int width,int height,int stride,const CNN_CONFIG * cnn_config,const CNN_THREAD_DATA * thread_data,int bit_depth,CNN_MULTI_OUT * output)1140 bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
1141 int stride,
1142 const CNN_CONFIG *cnn_config,
1143 const CNN_THREAD_DATA *thread_data,
1144 int bit_depth,
1145 CNN_MULTI_OUT *output) {
1146 const float max_val = (float)((1 << bit_depth) - 1);
1147
1148 const int in_width = width + 2 * cnn_config->ext_width;
1149 const int in_height = height + 2 * cnn_config->ext_height;
1150 const int in_channels = cnn_config->layer_config[0].in_channels;
1151 float *inputs[CNN_MAX_CHANNELS];
1152 float *input_ =
1153 (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
1154 if (!input_) return false;
1155 const int in_stride = in_width;
1156
1157 for (int c = 0; c < in_channels; ++c) {
1158 inputs[c] = input_ + c * in_stride * in_height;
1159 float *input =
1160 inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;
1161
1162 if (cnn_config->strict_bounds) {
1163 for (int i = 0; i < height; ++i)
1164 for (int j = 0; j < width; ++j)
1165 input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
1166 // extend left and right
1167 for (int i = 0; i < height; ++i) {
1168 for (int j = -cnn_config->ext_width; j < 0; ++j)
1169 input[i * in_stride + j] = input[i * in_stride];
1170 for (int j = width; j < width + cnn_config->ext_width; ++j)
1171 input[i * in_stride + j] = input[i * in_stride + width - 1];
1172 }
1173 // extend top and bottom
1174 for (int i = -cnn_config->ext_height; i < 0; ++i)
1175 memcpy(&input[i * in_stride - cnn_config->ext_width],
1176 &input[-cnn_config->ext_width], in_width * sizeof(*input));
1177 for (int i = height; i < height + cnn_config->ext_height; ++i)
1178 memcpy(&input[i * in_stride - cnn_config->ext_width],
1179 &input[(height - 1) * in_stride - cnn_config->ext_width],
1180 in_width * sizeof(*input));
1181 } else {
1182 for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
1183 ++i)
1184 for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
1185 ++j)
1186 input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
1187 }
1188 }
1189
1190 bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
1191 in_stride, cnn_config, thread_data, output);
1192
1193 aom_free(input_);
1194 return success;
1195 }
1196
1197 // Assume output already has proper allocation
1198 // Assume input image buffers all have same resolution and strides
av1_cnn_predict_img(uint8_t ** dgd,int width,int height,int stride,const CNN_CONFIG * cnn_config,const CNN_THREAD_DATA * thread_data,float ** output,int out_stride)1199 bool av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
1200 const CNN_CONFIG *cnn_config,
1201 const CNN_THREAD_DATA *thread_data, float **output,
1202 int out_stride) {
1203 int out_width = 0, out_height = 0, out_channels = 0;
1204 av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
1205 &out_channels);
1206 const int output_chs[1] = { out_channels };
1207 const int output_strides[1] = { out_stride };
1208 CNN_MULTI_OUT output_struct = { .output_channels = output_chs,
1209 .output_strides = output_strides,
1210 .output_buffer = output };
1211 return av1_cnn_predict_img_multi_out(dgd, width, height, stride, cnn_config,
1212 thread_data, &output_struct);
1213 }
1214
1215 // Assume output already has proper allocation
1216 // Assume input image buffers all have same resolution and strides
av1_cnn_predict_img_highbd(uint16_t ** dgd,int width,int height,int stride,const CNN_CONFIG * cnn_config,const CNN_THREAD_DATA * thread_data,int bit_depth,float ** output,int out_stride)1217 bool av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
1218 int stride, const CNN_CONFIG *cnn_config,
1219 const CNN_THREAD_DATA *thread_data,
1220 int bit_depth, float **output, int out_stride) {
1221 int out_width = 0, out_height = 0, out_channels = 0;
1222 av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
1223 &out_channels);
1224 const int output_chs[1] = { out_channels };
1225 const int output_strides[1] = { out_stride };
1226 CNN_MULTI_OUT output_struct = { .output_channels = output_chs,
1227 .output_strides = output_strides,
1228 .output_buffer = output };
1229 return av1_cnn_predict_img_multi_out_highbd(dgd, width, height, stride,
1230 cnn_config, thread_data,
1231 bit_depth, &output_struct);
1232 }
1233