1 /*
2 * Copyright (c) 2019, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <assert.h>
13 #include <math.h>
14 #include <stdbool.h>
15
16 #include "aom_dsp/aom_dsp_common.h"
17 #include "av1/common/av1_common_int.h"
18 #include "av1/encoder/cnn.h"
19
// Clamps index (a) into the valid range [0, hi - 1]; used to replicate edge
// pixels for PADDING_SAME_REPLICATE. Function-like macro: arguments are
// evaluated more than once, so pass only side-effect-free expressions.
#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))
21
// Arguments for one worker-thread invocation of av1_cnn_convolve().
// Each worker processes an interleaved subset of the work: indices
// start_idx, start_idx + th_step, start_idx + 2 * th_step, ...
typedef struct {
  const float **input;  // in_channels input planes
  int in_width;
  int in_height;
  int in_stride;
  const CNN_LAYER_CONFIG *layer_config;
  float **output;  // out_channels output planes
  int out_stride;
  int start_idx;  // first work index handled by this worker
  int th_step;    // number of workers sharing the layer
} CONVOLVE_OPS;
33
// Softsign activation: x / (1 + |x|), a smooth squashing into (-1, 1).
static inline float softsign(float x) { return x / (1.0f + fabsf(x)); }
35
// Rectified linear unit: negative inputs become zero, others pass through.
// (NaN compares false against 0 and is passed through, as in the original.)
static inline float relu(float x) {
  if (x < 0) return 0;
  return x;
}
37
// A multi-channel 2D float tensor. When the tensor owns its storage
// (allocsize > 0), all channel planes live in one contiguous allocation
// anchored at buf[0]; buf[c] points at plane c inside it. When assigned
// external storage via assign_tensor(), allocsize is 0 and nothing is owned.
typedef struct {
  int allocsize;  // floats allocated through buf[0]; 0 == not owned
  int channels;
  int width, height, stride;
  float *buf[CNN_MAX_CHANNELS];  // per-channel plane pointers
} TENSOR;
44
// Zero-initializes every TENSOR field; the tensor owns no storage afterwards.
static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(*tensor)); }
46
// Releases tensor-owned storage (if any) and marks the tensor as unowned.
// Storage assigned via assign_tensor() (allocsize == 0) is left untouched.
static void free_tensor(TENSOR *tensor) {
  if (tensor->allocsize == 0) return;
  aom_free(tensor->buf[0]);
  tensor->buf[0] = NULL;
  tensor->allocsize = 0;
}
54
// (Re)allocates tensor storage to hold `channels` planes of width x height
// floats, reusing the current buffer when it is already large enough.
// Channel pointers buf[1..channels-1] are re-derived from buf[0], so all
// planes share one contiguous allocation and stride equals width.
// Returns false on allocation failure (the tensor is left freed but valid).
// NOTE(review): channels * width * height is computed in int — assumes the
// product cannot overflow for any supported configuration; verify.
static bool realloc_tensor(TENSOR *tensor, int channels, int width,
                           int height) {
  const int newallocsize = channels * width * height;
  if (tensor->allocsize < newallocsize) {
    free_tensor(tensor);
    tensor->buf[0] =
        (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize);
    if (!tensor->buf[0]) return false;
    tensor->allocsize = newallocsize;
  }
  tensor->width = width;
  tensor->height = height;
  tensor->stride = width;
  tensor->channels = channels;
  for (int c = 1; c < channels; ++c)
    tensor->buf[c] = &tensor->buf[0][c * width * height];
  return true;
}
73
// Copies the first `copy_channels` planes of src into dst, starting at dst
// plane `dst_offset`. Uses one memcpy per plane when both layouts are
// contiguous (stride == width); otherwise copies row by row.
static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset,
                        TENSOR *dst) {
  assert(src->width == dst->width);
  assert(src->height == dst->height);
  assert(copy_channels <= src->channels);
  const bool contiguous =
      (src->stride == dst->width && dst->stride == dst->width);
  for (int ch = 0; ch < copy_channels; ++ch) {
    if (contiguous) {
      memcpy(dst->buf[dst_offset + ch], src->buf[ch],
             sizeof(*dst->buf[0]) * src->width * src->height);
    } else {
      for (int row = 0; row < dst->height; ++row) {
        memcpy(&dst->buf[dst_offset + ch][row * dst->stride],
               &src->buf[ch][row * src->stride],
               dst->width * sizeof(*dst->buf[ch]));
      }
    }
  }
}
94
// Points the tensor at caller-owned storage (or clears its plane pointers
// when buf is NULL). allocsize is set to 0 so free_tensor() will not
// release memory the tensor does not own.
static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS],
                          int channels, int width, int height, int stride) {
  tensor->allocsize = 0;
  tensor->channels = channels;
  tensor->width = width;
  tensor->height = height;
  tensor->stride = stride;
  for (int ch = 0; ch < channels; ++ch) {
    tensor->buf[ch] = buf ? buf[ch] : NULL;
  }
}
108
// Exchanges the full contents (dimensions, ownership, plane pointers) of
// the two tensors by value.
static void swap_tensor(TENSOR *t1, TENSOR *t2) {
  const TENSOR tmp = *t1;
  *t1 = *t2;
  *t2 = tmp;
}
114
// The concatenated tensor goes into dst with first the channels in
// original dst followed by the channels in the src.
// Grows dst's allocation when needed (preserving its existing planes), then
// re-derives all channel pointers and appends src's planes after the
// original dst planes. Returns false on allocation failure.
// NOTE(review): when no reallocation is needed, dst->channels is not
// updated to the combined count here — presumably a later realloc_tensor()
// call or the realloc path normally runs; confirm against callers.
static bool concat_tensor(const TENSOR *src, TENSOR *dst) {
  assert(src->width == dst->width);
  assert(src->height == dst->height);

  const int dst_channels = dst->channels;
  const int channels = dst->channels + src->channels;
  const int newallocsize = channels * dst->width * dst->height;
  if (dst->allocsize < newallocsize) {
    TENSOR t;
    init_tensor(&t);
    // allocate new buffers and copy first the dst channels
    if (!realloc_tensor(&t, channels, dst->width, dst->height)) return false;
    copy_tensor(dst, dst->channels, 0, &t);
    // Swap the tensors and free the old buffers
    swap_tensor(dst, &t);
    free_tensor(&t);
  }
  // Re-derive plane pointers for the combined channel count (planes are
  // contiguous within buf[0]).
  for (int c = 1; c < channels; ++c)
    dst->buf[c] = &dst->buf[0][c * dst->width * dst->height];
  // Copy the channels in src after the first dst_channels channels.
  copy_tensor(src, src->channels, dst_channels, dst);
  return true;
}
140
// Returns nonzero iff the two tensors have the same width and height
// (channel counts may differ).
int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) {
  if (t1->width != t2->width) return 0;
  return t1->height == t2->height;
}
144
// Returns nonzero iff the two tensors match in channels, width and height.
int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) {
  if (t1->channels != t2->channels) return 0;
  if (t1->width != t2->width) return 0;
  return t1->height == t2->height;
}
149
av1_find_cnn_layer_output_size(int in_width,int in_height,const CNN_LAYER_CONFIG * layer_config,int * out_width,int * out_height)150 void av1_find_cnn_layer_output_size(int in_width, int in_height,
151 const CNN_LAYER_CONFIG *layer_config,
152 int *out_width, int *out_height) {
153 assert(layer_config->skip_width > 0);
154 assert(layer_config->skip_height > 0);
155 if (!layer_config->deconvolve) {
156 switch (layer_config->pad) {
157 case PADDING_SAME_ZERO:
158 case PADDING_SAME_REPLICATE:
159 *out_width = (in_width + layer_config->skip_width - 1) /
160 layer_config->skip_width;
161 *out_height = (in_height + layer_config->skip_height - 1) /
162 layer_config->skip_height;
163 break;
164 case PADDING_VALID:
165 *out_width =
166 (in_width - layer_config->filter_width + layer_config->skip_width) /
167 layer_config->skip_width;
168 *out_height = (in_height - layer_config->filter_height +
169 layer_config->skip_height) /
170 layer_config->skip_height;
171 break;
172 default: assert(0 && "Unknown padding type");
173 }
174 } else {
175 switch (layer_config->pad) {
176 case PADDING_SAME_ZERO:
177 case PADDING_SAME_REPLICATE:
178 *out_width = in_width * layer_config->skip_width;
179 *out_height = in_height * layer_config->skip_height;
180 break;
181 case PADDING_VALID:
182 *out_width = (in_width - 1) * layer_config->skip_width +
183 layer_config->filter_width;
184 *out_height = (in_height - 1) * layer_config->skip_height +
185 layer_config->filter_height;
186 break;
187 default: assert(0 && "Unknown padding type");
188 }
189 }
190 }
191
// Updates channels_per_branch[] with the channel count each branch holds
// after this layer runs, accounting for copies made to other branches
// (per branch_copy_type) and for channels concatenated from other branches
// into this one.
void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config,
                           int channels_per_branch[]) {
  int branch = layer_config->branch;
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      // Branch b receives a copy from this layer; its channel count depends
      // on where in the layer the copy is taken.
      if (layer_config->branch_copy_type == BRANCH_INPUT) {
        channels_per_branch[b] = layer_config->in_channels;
      } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
        channels_per_branch[b] = layer_config->out_channels;
      } else if (layer_config->branch_copy_type == BRANCH_COMBINED) {
        // Copy is taken after combining: this layer's output channels plus
        // every branch concatenated into it.
        channels_per_branch[b] = layer_config->out_channels;
        for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
          if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
            assert(channels_per_branch[c] > 0);
            channels_per_branch[b] += channels_per_branch[c];
          }
        }
      }
    }
  }
  // This branch itself ends with out_channels plus any combined branches.
  channels_per_branch[branch] = layer_config->out_channels;
  for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
    if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
      assert(channels_per_branch[c] > 0);
      channels_per_branch[branch] += channels_per_branch[c];
    }
  }
}
221
222 #if CONFIG_DEBUG
cnn_has_at_least_one_output(const CNN_CONFIG * cnn_config)223 static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) {
224 const int num_layers = cnn_config->num_layers;
225 const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config;
226
227 for (int idx = 0; idx < num_layers; idx++) {
228 if (layer_configs[idx].output_num != -1) {
229 return 1;
230 }
231 }
232 return 0;
233 }
234 #endif
235
// Walks all layers of the network and, for every layer flagged as an output
// (output_num != -1), stores its output width, height and channel count in
// the caller-provided arrays indexed by output_num. The input is first
// extended by ext_width/ext_height on each side.
void av1_find_cnn_output_size(int in_width, int in_height,
                              const CNN_CONFIG *cnn_config, int *out_width,
                              int *out_height, int *out_channels) {
  int channels_per_branch[CNN_MAX_BRANCHES] = { 0 };
  // i_width/i_height track the current input size of each branch.
  int i_width[CNN_MAX_BRANCHES] = { 0 };
  int i_height[CNN_MAX_BRANCHES] = { 0 };
  i_width[0] = in_width + cnn_config->ext_width * 2;
  i_height[0] = in_height + cnn_config->ext_height * 2;

#if CONFIG_DEBUG
  assert(cnn_has_at_least_one_output(cnn_config));
#endif

  for (int i = 0; i < cnn_config->num_layers; ++i) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i];
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
    const int branch = layer_config->branch;
    int o_width = 0, o_height = 0;

    // A BRANCH_INPUT copy propagates this branch's *input* size to the
    // target branches before the layer runs.
    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          assert(i_width[branch] > 0 && i_height[branch] > 0);
          i_width[b] = i_width[branch];
          i_height[b] = i_height[branch];
        }
      }
    }

    av1_find_cnn_layer_output_size(i_width[branch], i_height[branch],
                                   layer_config, &o_width, &o_height);
    i_width[branch] = o_width;
    i_height[branch] = o_height;

    // A BRANCH_OUTPUT copy propagates this layer's *output* size instead.
    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          i_width[b] = o_width;
          i_height[b] = o_height;
        }
      }
    }

    find_cnn_out_channels(layer_config, channels_per_branch);

    const int output_num = layer_config->output_num;
    if (output_num != -1) {  // Current layer is an output layer
      out_width[output_num] = o_width;
      out_height[output_num] = o_height;
      out_channels[output_num] = channels_per_branch[layer_config->branch];
    }
  }
}
289
// Returns the horizontal/vertical offset of the first convolution sample so
// that the sampled positions are centered within `width` for the given
// filter size and stride, capped at the filter half-width.
static inline int get_start_shift_convolve(int width, int filt_width,
                                           int stride) {
  const int remainder = width % stride;
  const int half_off = (filt_width - 1) / 2;
  const int shift =
      ((remainder ? remainder - 1 : stride - 1) + (filt_width & 1)) / 2;
  return shift < half_off ? shift : half_off;
}
297
// Element-wise accumulation: output[c] += add[c] for every channel plane.
// Both sets of planes are width x height with the same row stride.
void av1_cnn_add_c(float **output, int channels, int width, int height,
                   int stride, const float **add) {
  for (int ch = 0; ch < channels; ++ch) {
    float *dst = output[ch];
    const float *src = add[ch];
    for (int r = 0; r < height; ++r) {
      for (int c = 0; c < width; ++c) {
        dst[r * stride + c] += src[r * stride + c];
      }
    }
  }
}
306
// Applies the layer's activation function in place to every sample of every
// channel plane. NONE is a no-op; SIGMOID is not implemented.
void av1_cnn_activate_c(float **output, int channels, int width, int height,
                        int stride, ACTIVATION layer_activation) {
  switch (layer_activation) {
    case NONE: break;
    case RELU:
      for (int ch = 0; ch < channels; ++ch) {
        for (int r = 0; r < height; ++r) {
          for (int c = 0; c < width; ++c) {
            output[ch][r * stride + c] = relu(output[ch][r * stride + c]);
          }
        }
      }
      break;
    case SOFTSIGN:
      for (int ch = 0; ch < channels; ++ch) {
        for (int r = 0; r < height; ++r) {
          for (int c = 0; c < width; ++c) {
            output[ch][r * stride + c] = softsign(output[ch][r * stride + c]);
          }
        }
      }
      break;
    case SIGMOID:
      assert(0 && "Sigmoid has not been supported in CNN.");  // TO DO
      break;
    default: assert(0 && "Unknown activation type");
  }
}
327
// For every branch selected in input_to_branches (other than the current
// one), sizes that branch's output tensor and copies the layer's active
// tensor into it; the copy becomes the input of the branch's first layer.
// Copies channels_to_copy channels when positive, otherwise all channels.
// Returns false on allocation failure.
static bool copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
                                           const CNN_LAYER_CONFIG *layer_config,
                                           int branch, TENSOR branch_output[]) {
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      // Copy layer's active tensor to output tensor of branch b if set in
      // mask. The output becomes the input of the first layer of the branch
      // because the layer of the branch is not the first layer.
      int copy_channels = branch_config->channels_to_copy > 0
                              ? branch_config->channels_to_copy
                              : layer_active_tensor->channels;
      if (!realloc_tensor(&branch_output[b], copy_channels,
                          layer_active_tensor->width,
                          layer_active_tensor->height)) {
        return false;
      }
      copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]);
    }
  }
  return true;
}
350
// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
// greater than 1 and padding equal to PADDING_SAME_ZERO.
// For each output position (u, v) the convolution is evaluated at every
// input position inside the skip_height x skip_width pooling window and the
// maximum response is kept. Filter taps outside the image contribute zero.
static void convolve_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        // Visit every input position inside this pooling window.
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              // Weights are laid out interleaved by channel pair; off steps
              // through taps for (input k, output i) with stride cstep.
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l - filter_height_half;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m - filter_width_half;
                  // Zero padding: taps outside the image are skipped.
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            // First window position initializes the output; the rest
            // max-pool against it.
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}
392
// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
// greater than 1 and padding equal to PADDING_SAME_REPLICATE.
// Same max-pooled convolution as the zero-padding variant, except taps that
// fall outside the image are clamped to the nearest edge pixel
// (via CLAMPINDEX) instead of contributing zero.
static void convolve_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        // Visit every input position inside this pooling window.
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                // Replicate padding: clamp the row index to the image.
                const int ii =
                    CLAMPINDEX(hh + l - filter_height_half, in_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj =
                      CLAMPINDEX(ww + m - filter_width_half, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            // First window position initializes the output; the rest
            // max-pool against it.
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}
435
// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
// greater than 1 and padding equal to PADDING_VALID.
// Max-pooled convolution over only those window anchors where the whole
// filter fits inside the image, so no index clamping is required.
static void convolve_maxpool_padding_valid(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    // Anchors are limited so the filter never extends past the image.
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++v) {
        // Visit every input position inside this pooling window.
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m;
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            // First window position initializes the output; the rest
            // max-pool against it.
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}
477
// CNNConvolve specific to maxpool set as 0 with filter_height and filter_width
// equal to 1.
// Each output sample is a 1x1 convolution: a dot product across input
// channels plus bias. start_idx/step partition the output *columns* among
// worker threads (thread start_idx handles columns start_idx,
// start_idx + step, ...).
static void convolve_element_wise(const float **input, int in_width,
                                  int in_height, int in_stride,
                                  const CNN_LAYER_CONFIG *const layer_config,
                                  float **output, int out_stride, int start_idx,
                                  int step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  // Shift this thread's first column by start_idx output positions.
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) +
      start_idx * layer_config->skip_width;
  const int out_w_step = AOMMAX(step, 1);
  const int in_w_step = layer_config->skip_width * out_w_step;
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int in_h = h * in_stride;
      const int out_h = u * out_stride + start_idx;
      for (int w = start_w, out_index = out_h; w < in_width;
           w += in_w_step, out_index += out_w_step) {
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          sum += layer_config->weights[k * layer_config->out_channels + i] *
                 input[k][in_h + w];
        }
        output[i][out_index] = sum;
      }
    }
  }
}
510
// CNNConvolve specific to maxpool set as 0 and padding equal to
// PADDING_SAME_ZERO.
// Zero padding is handled implicitly: instead of building a padded copy of
// the input, taps that fall outside the image are skipped and the weight
// offset is advanced past them via the top/left/right cstep corrections.
// start_idx/channel_step partition the output channels among threads.
static void convolve_no_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int filter_width_half,
    const int filter_height_half, const int ii_shift, const int jj_shift,
    const int channel_step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w = get_start_shift_convolve(
      in_width, layer_config->filter_width, layer_config->skip_width);
  const int end_ii_shift = filter_height_half + 1;
  const int end_jj_shift = filter_width_half + 1;
  // *_filter_margin stores the number of pixels along a dimension in the
  // intersection of the complement of the image in the extended image
  // and the filter.
  const int top_filter_margin = layer_config->filter_width * ii_shift;
  const int right_filter_margin = end_jj_shift - in_width;
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      // Weight offset correction for filter rows above the image.
      const int top_cstep =
          AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
              cstep +
          i;
      const int start_ii = AOMMAX(0, h - ii_shift);
      const int end_ii = AOMMIN(in_height, h + end_ii_shift);
      for (int w = start_w, out_index = out_h; w < in_width;
           w += layer_config->skip_width, ++out_index) {
        // Weight offset corrections for taps left/right of the image.
        const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
        const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep;
        const int start_jj = AOMMAX(0, w - jj_shift);
        const int end_jj = AOMMIN(in_width, w + end_jj_shift);
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + top_cstep;
          for (int ii = start_ii; ii < end_ii; ++ii) {
            off += left_cstep;
            for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
            }
            off += right_cstep;
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}
562
// CNNConvolve specific to maxpool set as 0 and padding equal to
// PADDING_SAME_REPLICATE.
// Taps outside the image are clamped to the nearest edge pixel via
// CLAMPINDEX. start_idx/channel_step partition the output channels among
// worker threads.
static void convolve_no_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int ii_shift, const int jj_shift,
    const int channel_step) {
  // h and w are shifted to an offset coordinate system to reduce in-loop
  // computation.
  const int start_h =
      get_start_shift_convolve(in_height, layer_config->filter_height,
                               layer_config->skip_height) -
      ii_shift;
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) -
      jj_shift;
  const int end_h = in_height - ii_shift;
  const int end_w = in_width - jj_shift;
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < end_h;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = start_w, out_index = out_h; w < end_w;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            // Replicate padding: clamp the sample position to the image.
            const int clamped_ii = CLAMPINDEX(ii, in_height);
            for (int jj = w; jj < upper_jj_index; ++jj) {
              const int clamped_jj = CLAMPINDEX(jj, in_width);
              assert(clamped_ii >= 0 && clamped_ii < in_height &&
                     clamped_jj >= 0 && clamped_jj < in_width);
              sum += layer_config->weights[off] *
                     input[k][clamped_ii * in_stride + clamped_jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}
610
// CNNConvolve specific to maxpool set as 0 and padding equal to
// PADDING_VALID.
// Convolution over anchors where the whole filter fits inside the image,
// so no boundary handling is needed. start_idx/channel_step partition the
// output channels among worker threads. This is the C reference for the
// SIMD-dispatched av1_cnn_convolve_no_maxpool_padding_valid().
void av1_cnn_convolve_no_maxpool_padding_valid_c(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
    int start_idx, int cstep, int channel_step) {
  assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
         !layer_config->maxpool);
  assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
  assert(layer_config->pad == PADDING_VALID);
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = 0, out_index = out_h;
           w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            for (int jj = w; jj < upper_jj_index; ++jj) {
              assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}
646
// Runs one (non-deconvolution) convolution layer, dispatching to the
// specialized kernel for the layer's maxpool/padding combination.
// start_idx/step partition the work among threads: the maxpool kernels
// ignore them (single-threaded), the element-wise kernel splits output
// columns, and the remaining kernels split output channels.
static void av1_cnn_convolve(const float **input, int in_width, int in_height,
                             int in_stride,
                             const CNN_LAYER_CONFIG *layer_config,
                             float **output, int out_stride, int start_idx,
                             int step) {
  assert(!layer_config->deconvolve);
  // Distance in the weights array between consecutive taps of one
  // (input-channel, output-channel) filter.
  const int cstep = layer_config->in_channels * layer_config->out_channels;
  const int filter_height_half = layer_config->filter_height >> 1;
  const int filter_width_half = layer_config->filter_width >> 1;
  const int channel_step = AOMMAX(step, 1);

  if (layer_config->maxpool &&
      (layer_config->skip_height > 1 || layer_config->skip_width > 1)) {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        convolve_maxpool_padding_zero(input, in_width, in_height, in_stride,
                                      layer_config, output, out_stride, cstep,
                                      filter_width_half, filter_height_half);
        break;
      case PADDING_SAME_REPLICATE:
        convolve_maxpool_padding_replicate(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, cstep, filter_width_half, filter_height_half);
        break;
      case PADDING_VALID:
        convolve_maxpool_padding_valid(input, in_width, in_height, in_stride,
                                       layer_config, output, out_stride, cstep);
        break;
      default: assert(0 && "Unknown padding type");
    }
  } else {
    // Results in element-wise matrix multiplication.
    if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
      convolve_element_wise(input, in_width, in_height, in_stride, layer_config,
                            output, out_stride, start_idx, step);
      return;
    }
    // For even filter sizes the center is biased; *_shift compensates.
    const int ii_shift =
        filter_height_half - (layer_config->filter_height - 1) % 2;
    const int jj_shift =
        filter_width_half - (layer_config->filter_width - 1) % 2;
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        convolve_no_maxpool_padding_zero(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, filter_width_half, filter_height_half,
            ii_shift, jj_shift, channel_step);
        break;
      case PADDING_SAME_REPLICATE:
        convolve_no_maxpool_padding_replicate(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, ii_shift, jj_shift, channel_step);
        break;
      case PADDING_VALID:
        av1_cnn_convolve_no_maxpool_padding_valid(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, channel_step);
        break;
      default: assert(0 && "Unknown padding type");
    }
  }
}
709
convolve_layer(void * arg1,void * arg2)710 static int convolve_layer(void *arg1, void *arg2) {
711 const CONVOLVE_OPS *convolve_ops = arg1;
712 (void)arg2;
713 av1_cnn_convolve(
714 convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height,
715 convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output,
716 convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step);
717 return 1;
718 }
719
// Splits one layer's convolution across the workers in thread_data: worker
// th handles interleaved work indices th, th + num_workers, ... The last
// worker runs synchronously on the calling thread (execute); the others are
// launched asynchronously and all are joined (sync) before returning.
// NOTE(review): the loops clamp at CNN_MAX_THREADS, but each op's step is
// num_workers — if num_workers exceeded CNN_MAX_THREADS some indices would
// never be processed. Presumably callers guarantee
// num_workers <= CNN_MAX_THREADS; confirm at call sites.
static void convolve_layer_mt(const float **input, int in_width, int in_height,
                              int in_stride,
                              const CNN_LAYER_CONFIG *layer_config,
                              const CNN_THREAD_DATA *thread_data,
                              float **output, int out_stride) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  const int num_workers = thread_data->num_workers;
  assert(thread_data->workers);

  CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    AVxWorker *const worker = &thread_data->workers[th];
    winterface->reset(worker);

    CONVOLVE_OPS convolve_op = { input,     in_width,     in_height,
                                 in_stride, layer_config, output,
                                 out_stride, th,          num_workers };
    convolve_ops[th] = convolve_op;
    worker->hook = convolve_layer;
    worker->data1 = &(convolve_ops[th]);
    worker->data2 = NULL;

    // Start convolving.
    if (th == num_workers - 1) {
      winterface->execute(worker);
    } else {
      winterface->launch(worker);
    }
  }

  // Wait until all workers have finished.
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    winterface->sync(&thread_data->workers[th]);
  }
}
755
// Returns half the amount by which the filter overhangs one deconvolution
// step (0 when the filter is no wider than the stride), used to center the
// upsampled output.
static inline int get_start_shift_deconvolve(int filt_width, int stride) {
  const int excess = filt_width > stride ? filt_width - stride : 0;
  return excess >> 1;
}
760
// Applies per-channel batch normalization in place:
//   x = gamma * (x - mean) / std + beta
// image: `channels` planes of width x height floats with row stride
// `stride`; gamma/beta/mean/std hold one value per channel.
void av1_cnn_batchnorm_c(float **image, int channels, int width, int height,
                         int stride, const float *gamma, const float *beta,
                         const float *mean, const float *std) {
  // Bug fix: the assert previously tested `beta` twice and never `mean`.
  assert(gamma && beta && mean && std && "batchnorm has null parameter!");
  for (int ch = 0; ch < channels; ch++) {
    const float ch_gamma = gamma[ch];
    const float ch_beta = beta[ch];
    const float ch_mean = mean[ch];
    const float ch_std = std[ch];
    float *image_row = image[ch];

    for (int row = 0; row < height; row++) {
      for (int col = 0; col < width; col++) {
        image_row[col] =
            ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta;
      }
      image_row += stride;
    }
  }
}
781
av1_cnn_deconvolve_c(const float ** input,int in_width,int in_height,int in_stride,const CNN_LAYER_CONFIG * layer_config,float ** output,int out_stride)782 void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
783 int in_stride, const CNN_LAYER_CONFIG *layer_config,
784 float **output, int out_stride) {
785 assert(layer_config->deconvolve);
786
787 const int cstep = layer_config->in_channels * layer_config->out_channels;
788
789 int out_width = 0;
790 int out_height = 0;
791 av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width,
792 &out_height);
793 switch (layer_config->pad) {
794 case PADDING_SAME_ZERO:
795 for (int i = 0; i < layer_config->out_channels; ++i) {
796 for (int u = 0; u < out_height; ++u) {
797 for (int v = 0; v < out_width; ++v) {
798 float sum = layer_config->bias[i];
799 for (int k = 0; k < layer_config->in_channels; ++k) {
800 int off = k * layer_config->out_channels + i;
801 for (int l = 0; l < layer_config->filter_height; ++l) {
802 const int h =
803 u - l +
804 get_start_shift_deconvolve(layer_config->filter_height,
805 layer_config->skip_height);
806 for (int m = 0; m < layer_config->filter_width;
807 ++m, off += cstep) {
808 const int w =
809 v - m +
810 get_start_shift_deconvolve(layer_config->filter_width,
811 layer_config->skip_width);
812 if ((h % layer_config->skip_height) != 0 ||
813 (w % layer_config->skip_width) != 0)
814 continue;
815 const int ii = h / layer_config->skip_height;
816 const int jj = w / layer_config->skip_width;
817 if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
818 continue;
819 sum += layer_config->weights[off] *
820 input[k][ii * in_stride + jj];
821 }
822 }
823 }
824 output[i][u * out_stride + v] = sum;
825 }
826 }
827 }
828 break;
829 case PADDING_SAME_REPLICATE:
830 for (int i = 0; i < layer_config->out_channels; ++i) {
831 for (int u = 0; u < out_height; ++u) {
832 for (int v = 0; v < out_width; ++v) {
833 float sum = layer_config->bias[i];
834 for (int k = 0; k < layer_config->in_channels; ++k) {
835 int off = k * layer_config->out_channels + i;
836 for (int l = 0; l < layer_config->filter_height; ++l) {
837 const int h =
838 u - l +
839 get_start_shift_deconvolve(layer_config->filter_height,
840 layer_config->skip_height);
841 for (int m = 0; m < layer_config->filter_width;
842 ++m, off += cstep) {
843 const int w =
844 v - m +
845 get_start_shift_deconvolve(layer_config->filter_width,
846 layer_config->skip_width);
847 if ((h % layer_config->skip_height) != 0 ||
848 (w % layer_config->skip_width) != 0)
849 continue;
850 const int ii =
851 CLAMPINDEX(h / layer_config->skip_height, in_height);
852 const int jj =
853 CLAMPINDEX(w / layer_config->skip_width, in_width);
854 assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
855 sum += layer_config->weights[off] *
856 input[k][ii * in_stride + jj];
857 }
858 }
859 }
860 output[i][u * out_stride + v] = sum;
861 }
862 }
863 }
864 break;
865 case PADDING_VALID:
866 for (int i = 0; i < layer_config->out_channels; ++i) {
867 for (int u = 0; u < out_height; ++u) {
868 for (int v = 0; v < out_width; ++v) {
869 float sum = layer_config->bias[i];
870 for (int k = 0; k < layer_config->in_channels; ++k) {
871 int off = k * layer_config->out_channels + i;
872 for (int l = 0; l < layer_config->filter_height; ++l) {
873 const int h = u - l;
874 for (int m = 0; m < layer_config->filter_width;
875 ++m, off += cstep) {
876 const int w = v - m;
877 if ((h % layer_config->skip_height) != 0 ||
878 (w % layer_config->skip_width) != 0)
879 continue;
880 const int ii = h / layer_config->skip_height;
881 const int jj = w / layer_config->skip_width;
882 if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
883 continue;
884 sum += layer_config->weights[off] *
885 input[k][ii * in_stride + jj];
886 }
887 }
888 }
889 output[i][u * out_stride + v] = sum;
890 }
891 }
892 }
893 break;
894 default: assert(0 && "Unknown padding type");
895 }
896 }
897
av1_cnn_predict_c(const float ** input,int in_width,int in_height,int in_stride,const CNN_CONFIG * cnn_config,const CNN_THREAD_DATA * thread_data,CNN_MULTI_OUT * output_struct)898 bool av1_cnn_predict_c(const float **input, int in_width, int in_height,
899 int in_stride, const CNN_CONFIG *cnn_config,
900 const CNN_THREAD_DATA *thread_data,
901 CNN_MULTI_OUT *output_struct) {
902 bool success = false;
903 TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } };
904 TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } };
905
906 float **output[CNN_MAX_BRANCHES];
907 const int *out_chs = output_struct->output_channels;
908 output[0] = output_struct->output_buffer;
909 for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) {
910 output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1];
911 }
912
913 int i_width = in_width;
914 int i_height = in_height;
915 int o_width = 0, o_height = 0;
916 for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
917 init_tensor(&tensor1[b]);
918 init_tensor(&tensor2[b]);
919 }
920
921 const int *out_stride = output_struct->output_strides;
922 for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
923 const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
924 const int branch = layer_config->branch;
925 const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
926
927 // Allocate input tensor
928 if (layer == 0) { // First layer
929 assert(branch == 0); // First layer must be primary branch
930 assign_tensor(&tensor1[branch], (float **)input,
931 layer_config->in_channels, in_width, in_height, in_stride);
932 } else { // Non-first layer
933 // Swap tensor1 and tensor2
934 swap_tensor(&tensor1[branch], &tensor2[branch]);
935
936 i_width = tensor1[branch].width;
937 i_height = tensor1[branch].height;
938 }
939
940 // Allocate output tensor
941 av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width,
942 &o_height);
943 const int output_num = layer_config->output_num;
944 if (output_num == -1) { // Non-output layer
945 if (!realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
946 o_height)) {
947 goto Error;
948 }
949 } else { // Output layer
950 free_tensor(&tensor2[branch]);
951 assign_tensor(&tensor2[branch], output[output_num],
952 layer_config->out_channels, o_width, o_height,
953 out_stride[output_num]);
954 }
955
956 // If we are combining branches make sure that the branch to combine
957 // is different from the current branch.
958 assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC,
959 !(branch_config->branches_to_combine & (1 << branch))));
960
961 if (layer_config->branch_copy_type == BRANCH_INPUT) {
962 if (!copy_active_tensor_to_branches(&tensor1[branch], layer_config,
963 branch, tensor2)) {
964 goto Error;
965 }
966 }
967 // Check consistency of input and output channels
968 assert(tensor1[branch].channels == layer_config->in_channels);
969 assert(tensor2[branch].channels == layer_config->out_channels);
970
971 // Convolve/Deconvolve
972 if (!cnn_config->layer_config[layer].deconvolve) {
973 if (thread_data->num_workers > 1) {
974 convolve_layer_mt((const float **)tensor1[branch].buf,
975 tensor1[branch].width, tensor1[branch].height,
976 tensor1[branch].stride, layer_config, thread_data,
977 tensor2[branch].buf, tensor2[branch].stride);
978 } else {
979 av1_cnn_convolve((const float **)tensor1[branch].buf,
980 tensor1[branch].width, tensor1[branch].height,
981 tensor1[branch].stride, layer_config,
982 tensor2[branch].buf, tensor2[branch].stride, 0, 1);
983 }
984 } else {
985 av1_cnn_deconvolve((const float **)tensor1[branch].buf,
986 tensor1[branch].width, tensor1[branch].height,
987 tensor1[branch].stride, layer_config,
988 tensor2[branch].buf, tensor2[branch].stride);
989 }
990
991 if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
992 if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
993 branch, tensor2)) {
994 goto Error;
995 }
996 }
997
998 // Add tensors from other branches if needed
999 if (layer_config->branch_combine_type == BRANCH_ADD) {
1000 for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
1001 if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
1002 assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch]));
1003 av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels,
1004 tensor2[branch].width, tensor2[branch].height,
1005 tensor2[branch].stride, (const float **)tensor2[b].buf);
1006 }
1007 }
1008 }
1009
1010 // Non-linearity
1011 av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels,
1012 tensor2[branch].width, tensor2[branch].height,
1013 tensor2[branch].stride, layer_config->activation);
1014
1015 if (layer_config->bn_params.bn_gamma) {
1016 av1_cnn_batchnorm(
1017 tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width,
1018 tensor2[branch].height, tensor2[branch].stride,
1019 layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta,
1020 layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std);
1021 }
1022
1023 // Concatenate tensors
1024 if (layer_config->branch_combine_type == BRANCH_CAT) {
1025 if (output_num == -1) { // Non-output layer
1026 for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
1027 if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
1028 assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
1029 assert(tensor2[b].channels > 0);
1030 if (!concat_tensor(&tensor2[b], &tensor2[branch])) goto Error;
1031 }
1032 }
1033 } else { // Output layer
1034 const int existing_channels = tensor2[branch].channels;
1035 int num_chs = existing_channels;
1036 for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
1037 if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
1038 assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
1039 // Needed only to assign the new channel buffers
1040 num_chs += tensor2[b].channels;
1041 }
1042 }
1043 assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width,
1044 o_height, out_stride[output_num]);
1045
1046 num_chs = existing_channels;
1047 for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
1048 if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
1049 assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
1050 // Needed only to assign the new channel buffers
1051 copy_tensor(&tensor2[b], tensor2[b].channels, num_chs,
1052 &tensor2[branch]);
1053 num_chs += tensor2[b].channels;
1054 }
1055 }
1056 }
1057 }
1058
1059 if (layer_config->branch_copy_type == BRANCH_COMBINED) {
1060 if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
1061 branch, tensor2)) {
1062 goto Error;
1063 }
1064 }
1065 }
1066
1067 success = true;
1068 Error:
1069 for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
1070 free_tensor(&tensor1[b]);
1071 free_tensor(&tensor2[b]);
1072 }
1073 return success;
1074 }
1075
// Assume output already has proper allocation
// Assume input image buffers all have same resolution and strides
//
// Converts the 8-bit planes in `dgd` to floats normalized by 255, extends
// the borders by (ext_width, ext_height) samples on each side, and runs CNN
// inference into `output`. Returns false on allocation failure.
bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
                                   int stride, const CNN_CONFIG *cnn_config,
                                   const CNN_THREAD_DATA *thread_data,
                                   CNN_MULTI_OUT *output) {
  const float max_val = 255.0;

  const int in_width = width + 2 * cnn_config->ext_width;
  const int in_height = height + 2 * cnn_config->ext_height;
  const int in_channels = cnn_config->layer_config[0].in_channels;
  float *inputs[CNN_MAX_CHANNELS];
  // One contiguous allocation holds all channel planes (freed below).
  // NOTE(review): in_width * in_height * in_channels is computed in int —
  // could overflow for very large frames; TODO confirm callers bound sizes.
  float *input_ =
      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
  if (!input_) return false;
  const int in_stride = in_width;

  for (int c = 0; c < in_channels; ++c) {
    inputs[c] = input_ + c * in_stride * in_height;
    // `input` points at the interior (non-extended) origin of the plane, so
    // negative row/col offsets address the border region.
    float *input =
        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;

    if (cnn_config->strict_bounds) {
      // Only pixels inside [0, width) x [0, height) of dgd may be read;
      // synthesize the border by edge replication.
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
      // extend left and right
      for (int i = 0; i < height; ++i) {
        for (int j = -cnn_config->ext_width; j < 0; ++j)
          input[i * in_stride + j] = input[i * in_stride];
        for (int j = width; j < width + cnn_config->ext_width; ++j)
          input[i * in_stride + j] = input[i * in_stride + width - 1];
      }
      // extend top and bottom
      for (int i = -cnn_config->ext_height; i < 0; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[-cnn_config->ext_width], in_width * sizeof(*input));
      for (int i = height; i < height + cnn_config->ext_height; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[(height - 1) * in_stride - cnn_config->ext_width],
               in_width * sizeof(*input));
    } else {
      // NOTE(review): this branch reads dgd outside [0, width) x [0, height);
      // the caller must guarantee valid pixels in the extended range.
      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
           ++i)
        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
             ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
    }
  }
  bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
                                 in_stride, cnn_config, thread_data, output);

  aom_free(input_);
  return success;
}
1131
// Assume output already has proper allocation
// Assume input image buffers all have same resolution and strides
//
// High-bit-depth variant of av1_cnn_predict_img_multi_out(): converts the
// 16-bit planes in `dgd` to floats normalized by (2^bit_depth - 1), extends
// the borders by (ext_width, ext_height) samples per side, and runs CNN
// inference into `output`. Returns false on allocation failure.
bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
                                          int stride,
                                          const CNN_CONFIG *cnn_config,
                                          const CNN_THREAD_DATA *thread_data,
                                          int bit_depth,
                                          CNN_MULTI_OUT *output) {
  const float max_val = (float)((1 << bit_depth) - 1);

  const int in_width = width + 2 * cnn_config->ext_width;
  const int in_height = height + 2 * cnn_config->ext_height;
  const int in_channels = cnn_config->layer_config[0].in_channels;
  float *inputs[CNN_MAX_CHANNELS];
  // One contiguous allocation holds all channel planes (freed below).
  // NOTE(review): in_width * in_height * in_channels is computed in int —
  // could overflow for very large frames; TODO confirm callers bound sizes.
  float *input_ =
      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
  if (!input_) return false;
  const int in_stride = in_width;

  for (int c = 0; c < in_channels; ++c) {
    inputs[c] = input_ + c * in_stride * in_height;
    // `input` points at the interior (non-extended) origin of the plane, so
    // negative row/col offsets address the border region.
    float *input =
        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;

    if (cnn_config->strict_bounds) {
      // Only pixels inside [0, width) x [0, height) of dgd may be read;
      // synthesize the border by edge replication.
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
      // extend left and right
      for (int i = 0; i < height; ++i) {
        for (int j = -cnn_config->ext_width; j < 0; ++j)
          input[i * in_stride + j] = input[i * in_stride];
        for (int j = width; j < width + cnn_config->ext_width; ++j)
          input[i * in_stride + j] = input[i * in_stride + width - 1];
      }
      // extend top and bottom
      for (int i = -cnn_config->ext_height; i < 0; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[-cnn_config->ext_width], in_width * sizeof(*input));
      for (int i = height; i < height + cnn_config->ext_height; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[(height - 1) * in_stride - cnn_config->ext_width],
               in_width * sizeof(*input));
    } else {
      // NOTE(review): this branch reads dgd outside [0, width) x [0, height);
      // the caller must guarantee valid pixels in the extended range.
      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
           ++i)
        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
             ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
    }
  }

  bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
                                 in_stride, cnn_config, thread_data, output);

  aom_free(input_);
  return success;
}
1190