1 /*
2  * Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "util/u_inlines.h"
7 
8 #include "etnaviv_context.h"
9 #include "etnaviv_debug.h"
10 #include "etnaviv_emit.h"
11 #include "etnaviv_ml_nn.h"
12 
13 #define ETNA_NN_INT8 0
14 
15 #define SRAM_CACHE_MODE_NO_CACHE 0x0
16 #define SRAM_CACHE_MODE_FULL_CACHE 0x1
17 #define SRAM_CACHE_MODE_PARTIAL_CACHE 0x2
18 
19 enum pooling_type {
20     ETNA_NN_POOLING_NON,
21     ETNA_NN_POOLING_MAX,
22     ETNA_NN_POOLING_AVG,
23     ETNA_NN_POOLING_FIRST_PIXEL
24 };
25 
26 #define FIELD(field, bits) uint32_t field : bits;
27 
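/* This struct appears to mirror the per-layer command descriptor that the NN
 * units consume: the numbered comments group the bitfields into consecutive
 * 32-bit words. Relying on C bitfields for a hardware layout assumes a
 * little-endian target with LSB-first packing, which holds for the platforms
 * etnaviv supports.
 */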
28 struct etna_nn_params {
29 
30    FIELD(layer_type, 1) /* conv: 0 fully_connected: 1 */
31    FIELD(no_z_offset, 1)
32    FIELD(kernel_xy_size, 4)
33    FIELD(kernel_z_size, 14) /* & 0x3FFF */
34    FIELD(kernels_per_core, 7)
35    FIELD(pooling, 2)
36    FIELD(pooling_xy_size, 1)
37    FIELD(prelu, 1)
38    FIELD(nn_layer_flush, 1)
39 
40    /* 1 */
41    FIELD(kernel_data_type, 2) /* UINT8 0x2 INT8 0x0 */
42    FIELD(in_image_data_type, 2) /* UINT8 0x2 INT8 0x0 */
43    FIELD(out_image_data_type, 2) /* UINT8 0x2 INT8 0x0 */
44    FIELD(in_image_x_size, 13)
45    FIELD(in_image_y_size, 13)
46 
47    /* 2 */
48    FIELD(in_image_x_offset, 3)
49    FIELD(in_image_y_offset, 3)
50    FIELD(unused0, 1)
51    FIELD(brick_mode, 1)
52    FIELD(brick_distance, 16)
53    FIELD(relu, 1)
54    FIELD(unused1, 1)
55    FIELD(post_multiplier, 1)
56    FIELD(post_shift, 5)
57 
58    /* 3 */
59    FIELD(unused2, 3)
60    FIELD(no_flush, 1)
61    FIELD(unused3, 2)
62    FIELD(out_image_x_size, 13)
63    FIELD(out_image_y_size, 13)
64 
65    /* 4 */
66    /* Changes based on gcFEATURE_VALUE_NN_INIMAGE_OFFSET_BITS == 4 */
67    FIELD(out_image_z_size, 14)
68    FIELD(rounding_mode, 2)
69    FIELD(in_image_x_offset_bit_3, 1) /*  >> 3 & 0x1 */
70    FIELD(in_image_y_offset_bit_3, 1) /*  >> 3 & 0x1 */
71    FIELD(out_image_tile_x_size, 7)
72    FIELD(out_image_tile_y_size, 7)
73 
74    /* 5 */
75    FIELD(kernel_address, 26) /* >> 6 */
76    FIELD(kernel_z_size2, 6) /* >> 14 & 0x3F */
77 
78    /* 6 */
79    FIELD(in_image_address, 32)
80 
81    /* 7 */
82    FIELD(out_image_address, 32)
83 
84    /* 8 */
85    FIELD(image_caching_mode, 2)
86    FIELD(kernel_caching_mode, 2)
87    FIELD(partial_cache_data_unit, 2)
88    FIELD(kernel_pattern_msb, 6)
89    FIELD(kernel_y_size, 4)
90    FIELD(out_image_y_stride, 16)
91 
92    /* 9 */
93    FIELD(kernel_pattern_low, 32)
94 
95    /* 10 */
96    FIELD(kernel_pattern_high, 32)
97 
98    /* 11 */
99    FIELD(kernel_cache_start_address, 32)
100 
101    /* 12 */
102    FIELD(kernel_cache_end_address, 32)
103 
104    /* 13 */
105    FIELD(image_cache_start_address, 32)
106 
107    /* 14 */
108    FIELD(image_cache_end_address, 32)
109 
110    /* 15 */
111    FIELD(in_image_border_mode, 2)
112    FIELD(in_image_border_const, 16)
113    FIELD(unused4, 1)
114    FIELD(kernel_data_type_bit_2, 1)
115    FIELD(in_image_data_type_bit_2, 1)
116    FIELD(out_image_data_type_bit_2, 1)
117    FIELD(post_multiplier_1_to_6, 6)
118    FIELD(post_shift_bit_5_6, 2)
119    FIELD(unused5, 2)
120 
121    /* 16 */
122    FIELD(in_image_x_stride, 16)
123    FIELD(in_image_y_stride, 16)
124 
125    /* 17 */
126    FIELD(out_image_x_stride, 16)
127    FIELD(unused6, 8)
128    FIELD(post_multiplier_7_to_14, 8)
129 
130    /* 18 */
131    FIELD(out_image_circular_buf_size, 26) /* >> 6 */
132    FIELD(unused7, 5)
133    FIELD(per_channel_post_mul, 1)
134 
135    /* 19 */
136    FIELD(out_image_circular_buf_end_addr_plus_1, 26) /* >> 6 */
137    FIELD(unused8, 6)
138 
139    /* 20 */
140    FIELD(in_image_circular_buf_size, 26) /* >> 6 */
141    FIELD(unused9, 6)
142 
143    /* 21 */
144    FIELD(in_image_circular_buf_end_addr_plus_1, 26) /* >> 6 */
145    FIELD(unused10, 6)
146 
147    /* 22 */
148    FIELD(coef_zero_point, 8)
149    FIELD(out_zero_point, 8)
150    FIELD(kernel_direct_stream_from_VIP_sram, 1)
151    FIELD(depthwise, 1)
152    FIELD(unused11, 14)
153 
154    /* 23, from here they aren't set on  */
155    FIELD(unused12, 32)
156 
157    /* 24 */
158    FIELD(unused13, 4)
159    FIELD(unused14, 28)  /* 0 >> 4 */
160 
161    /* 25 */
162    FIELD(unused15, 4)
163    FIELD(unused16, 28)  /* 0 >> 4 */
164 
165    /* 26 */
166    FIELD(further1, 32)
167    FIELD(further2, 32)
168    FIELD(further3, 32)
169    FIELD(further4, 32)
170    FIELD(further5, 32)
171    FIELD(further6, 32)
172    FIELD(further7, 32)
173    FIELD(further8, 32)
174 };
175 
176 static void *
177 map_resource(struct pipe_resource *resource)
178 {
179    return etna_bo_map(etna_resource(resource)->bo);
180 }
181 
182 
183 static void
184 pointwise_to_2x2(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
185 {
186    /* Expand the 1x1 kernels to 2x2, filling the new positions with the weight zero point */
187    struct pipe_context *context = subgraph->base.context;
188    uint8_t *input = map_resource(operation->weight_tensor);
189    unsigned new_size = operation->output_channels * 2 * 2 * operation->input_channels;
190    struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT,
191                                                          new_size);
192    uint8_t *output = map_resource(output_res);
193 
194    for (unsigned channel = 0; channel < operation->output_channels; channel++) {
195       uint8_t *map_in = input + channel * 1 * 1 * operation->input_channels;
196       uint8_t *map_out = output + channel * 2 * 2 * operation->input_channels;
197 
198       map_out[0] = map_in[0];
199       map_out[1] = operation->weight_zero_point;
200       map_out[2] = operation->weight_zero_point;
201       map_out[3] = operation->weight_zero_point;
202    }
203 
204    pipe_resource_reference(&operation->weight_tensor, NULL);
205    operation->weight_tensor = output_res;
206 
207    operation->weight_width = operation->weight_height = 2;
208    operation->pointwise = false;
209 }
210 
211 static void
212 expand_depthwise(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
213 {
214    struct pipe_context *context = subgraph->base.context;
215    uint8_t *input = map_resource(operation->weight_tensor);
216    unsigned new_size = operation->output_channels * operation->weight_width * operation->weight_height * operation->input_channels;
217    struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT,
218                                                          new_size);
219    uint8_t *output = map_resource(output_res);
220 
221    /* Lower the depthwise convolution to a regular convolution, as the hardware doesn't support depthwise convolutions directly */
222    for (unsigned channel = 0; channel < operation->output_channels; channel++) {
223       unsigned in_channel = channel / operation->output_channels;
224       unsigned in_depth = channel % operation->output_channels;
225 
226       uint8_t *map_in = input + in_channel * operation->weight_width * operation->weight_height * operation->input_channels;
227       uint8_t *map_out = output + channel * operation->weight_width * operation->weight_height * operation->input_channels;
228 
229       for (unsigned i = 0; i < operation->weight_width * operation->weight_height * operation->input_channels; i++) {
230          if (i % operation->input_channels == in_depth)
231             map_out[i] = map_in[i];
232          else
233             map_out[i] = operation->weight_zero_point;
234       }
235    }
236 
237    pipe_resource_reference(&operation->weight_tensor, NULL);
238    operation->weight_tensor = output_res;
239 }
240 
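/* Reorder the weights from the channels-last layout they arrive in
 * (out_channel, x, y, in_channel) into the channels-first layout
 * (out_channel, in_channel, x, y) that the weight-writing functions below
 * index into.
 */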
241 static void
242 transpose(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
243 {
244    struct pipe_context *context = subgraph->base.context;
245    void *map = map_resource(operation->weight_tensor);
246    unsigned new_size = operation->output_channels * operation->weight_width * \
247                        operation->weight_height * operation->input_channels;
248    struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT,
249                                                          new_size);
250    uint8_t *output = map_resource(output_res);
251    unsigned output_channels = operation->output_channels;
252    unsigned input_channels = operation->input_channels;
253 
254    if (operation->addition) {
255       output_channels = 1;
256       input_channels = 2;
257    }
258 
259    uint8_t (*input)[operation->weight_width][operation->weight_height][input_channels] = map;
260    unsigned i = 0;
261    for (unsigned d0 = 0; d0 < output_channels; d0++)
262       for (unsigned d3 = 0; d3 < input_channels; d3++)
263          for (unsigned d1 = 0; d1 < operation->weight_width; d1++)
264             for (unsigned d2 = 0; d2 < operation->weight_height; d2++)
265                ((uint8_t*)output)[i++] = input[d0][d1][d2][d3];
266 
267    pipe_resource_reference(&operation->weight_tensor, NULL);
268    operation->weight_tensor = output_res;
269 }
270 
271 static void
272 subsample(uint8_t *map_in, unsigned in_width, unsigned in_height, unsigned in_depth, unsigned out_width, unsigned out_height, unsigned in_z, unsigned offset_x, unsigned offset_y, unsigned stride, uint8_t *map_out, int in_zp)
273 {
274    uint8_t (*in)[in_height][in_depth] = (uint8_t(*)[in_height][in_depth])map_in;
275    uint8_t (*out)[out_height] = (uint8_t(*)[out_height])map_out;
276 
277    for(unsigned x = 0; x < out_width; x++)
278       for(unsigned y = 0; y < out_height; y++) {
279          unsigned in_x = x * stride + offset_x;
280          unsigned in_y = y * stride + offset_y;
281          if (in_x < in_width && in_y < in_height)
282             out[x][y] = in[in_x][in_y][in_z];
283          else
284             out[x][y] = in_zp;
285       }
286 }
287 
288 /* TODO: Do the reshaping in the TP units, for big enough buffers */
289 static void
290 reshape(uint8_t *input, uint8_t *output, unsigned stride, int in_zp, unsigned dims_in[4], unsigned dims_out[4])
291 {
292    for (unsigned out_channel = 0; out_channel < dims_in[0]; out_channel++) {
293       void *map_in = input + out_channel * dims_in[1] * dims_in[2] * dims_in[3];
294       void *map_out = output + out_channel * dims_out[1] * dims_out[2] * dims_out[3];
295 
296       /* See Figure 3 in https://arxiv.org/abs/1712.02502 */
297       /* This is only valid for stride == 2 */
298       assert(stride == 2);
299       uint8_t (*out)[dims_out[1]][dims_out[2]] = (uint8_t(*)[dims_out[1]][dims_out[2]])map_out;
300       for (unsigned z = 0; z < dims_in[3]; z++) {
301          subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 0, 0, stride, (uint8_t *)out[0 + z * stride * stride], in_zp);
302          subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 0, 1, stride, (uint8_t *)out[1 + z * stride * stride], in_zp);
303          subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 1, 0, stride, (uint8_t *)out[2 + z * stride * stride], in_zp);
304          subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 1, 1, stride, (uint8_t *)out[3 + z * stride * stride], in_zp);
305       }
306    }
307 }
308 
309 static void
310 strided_to_normal(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
311 {
312    struct pipe_context *context = subgraph->base.context;
313    uint8_t *input = map_resource(operation->weight_tensor);
314    unsigned new_size;
315    struct pipe_resource *output_res;
316    uint8_t *output;
317 
318    /* The hardware doesn't support strides natively, so we "lower" them as
319     * described in this paper:
320     *
321     * "Take it in your stride: Do we need striding in CNNs?" https://arxiv.org/abs/1712.02502
322     */
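   /* A sketch of the lowering, assuming stride == 2 as asserted below: a KxK
    * convolution with stride 2 over a WxHxC input becomes a
    * ceil(K/2) x ceil(K/2) convolution with stride 1 over a
    * ceil(W/2) x ceil(H/2) x 4C input, where the four phases (even/odd rows
    * and columns) of the input become extra channels. reshape() rearranges
    * the weights accordingly.
    */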
323 
324    /* TODO: Support more strides */
325    assert(operation->stride == 2);
326 
327    unsigned wdims_in[4] = {operation->output_channels,
328                            operation->weight_width,
329                            operation->weight_height,
330                            operation->input_channels};
331 
332    operation->input_channels = operation->input_channels * operation->stride * operation->stride;
333    operation->input_width = DIV_ROUND_UP(operation->input_width, operation->stride);
334    operation->input_height = DIV_ROUND_UP(operation->input_height, operation->stride);
335 
336    if (operation->padding_same) {
337       if (operation->weight_width == 5) {
338          operation->input_width += 2;
339          operation->input_height += 2;
340       } else {
341          operation->input_width += 1;
342          operation->input_height += 1;
343       }
344    }
345 
346    operation->weight_width = DIV_ROUND_UP(operation->weight_width, operation->stride);
347    operation->weight_height = DIV_ROUND_UP(operation->weight_height, operation->stride);
348 
349    new_size = operation->output_channels * operation->weight_width * operation->weight_height * operation->input_channels;
350    output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, new_size);
351    output = map_resource(output_res);
352 
353    unsigned wdims_out[4] = {operation->output_channels, operation->weight_width, operation->weight_height, operation->input_channels};
354    reshape(input, output, operation->stride, operation->weight_zero_point, wdims_in, wdims_out);
355 
356    pipe_resource_reference(&operation->weight_tensor, NULL);
357    operation->weight_tensor = output_res;
358 }
359 
360 void
361 etna_ml_lower_convolution(struct etna_ml_subgraph *subgraph,
362                           const struct pipe_ml_operation *poperation,
363                           struct etna_operation *operation)
364 {
365    /* TODO: Support stride_x != stride_y */
366    assert(poperation->conv.stride_x == poperation->conv.stride_y);
367    assert(poperation->type == PIPE_ML_OPERATION_TYPE_CONVOLUTION);
368 
369    operation->type = ETNA_JOB_TYPE_NN;
370    operation->addition = false;
371    operation->depthwise = poperation->conv.depthwise;
372    operation->pointwise = poperation->conv.pointwise;
373    operation->pooling_first_pixel = poperation->conv.stride_x > 1 && \
374       (poperation->conv.depthwise || poperation->conv.pointwise);
375    operation->padding_same = poperation->conv.padding_same;
376    operation->stride = poperation->conv.stride_x;
377 
378    operation->input_tensor = poperation->input_tensor->index;
379    operation->input_width = poperation->input_tensor->dims[1];
380    operation->input_height = poperation->input_tensor->dims[2];
381    operation->input_channels = poperation->input_tensor->dims[3];
382    operation->input_zero_point = poperation->input_tensor->zero_point;
383    operation->input_scale = poperation->input_tensor->scale;
384 
385    operation->output_tensor = poperation->output_tensor->index;
386    operation->output_width = poperation->output_tensor->dims[1];
387    operation->output_height = poperation->output_tensor->dims[2];
388    operation->output_channels = poperation->output_tensor->dims[3];
389    operation->output_zero_point = poperation->output_tensor->zero_point;
390    operation->output_scale = poperation->output_tensor->scale;
391 
392    pipe_resource_reference(&operation->weight_tensor, poperation->conv.weight_tensor->resource);
393    operation->weight_width = poperation->conv.weight_tensor->dims[1];
394    operation->weight_height = poperation->conv.weight_tensor->dims[2];
395    operation->weight_zero_point = poperation->conv.weight_tensor->zero_point;
396    operation->weight_scale = poperation->conv.weight_tensor->scale;
397 
398    pipe_resource_reference(&operation->bias_tensor, poperation->conv.bias_tensor->resource);
399 
400    if (operation->pointwise && operation->input_channels == 1)
401       pointwise_to_2x2(subgraph, operation);
402 
403    if (operation->depthwise && (operation->output_channels > 1 || operation->stride > 1)) {
404 
405       if (operation->input_width < 8 && operation->input_width > 2)
406          operation->pooling_first_pixel = false;
407 
408       expand_depthwise(subgraph, operation);
409    }
410 
411    if (operation->stride > 1 && !operation->pooling_first_pixel)
412       strided_to_normal(subgraph, operation);  /* This will already transpose if input_channels > 1 */
413    else if (operation->input_channels > 1)
414       transpose(subgraph, operation);
415 
416    operation->input_tensor_size = operation->input_width *
417                                   operation->input_height *
418                                   operation->input_channels;
419    ML_DBG("%dx%dx%d\n", operation->input_width, operation->input_height, operation->input_channels);
420 }
421 
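/* Helpers for lowering an element-wise ADD to a convolution (see
 * etna_ml_lower_add below): the two operands become the two input channels of
 * a single-output-channel convolution, so their relative scale has to be
 * encoded in the quantized weight, the addition offset and the bias. The
 * derivations below presumably mirror what the proprietary driver computes.
 */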
422 static float
423 compute_weight_scale_add(float input1_scale, float input2_scale)
424 {
425    double scale_ratio = input1_scale / input2_scale;
426 
427    return (float) MAX2(scale_ratio, 1.0) / 255.0;
428 }
429 
430 static uint8_t
431 compute_addition_offset(float input1_scale, float input2_scale, float weight_scale)
432 {
433   double addition_offset = input1_scale / input2_scale;
434   addition_offset /= weight_scale;
435   return round(addition_offset + 0.0) * 1;
436 }
437 
438 static uint8_t
439 compute_weight_add(float input1_scale, float input2_scale, float weight_scale)
440 {
441    double weight = 1.0 / weight_scale;
442    return round(weight + 0.0);
443 }
444 
445 static uint32_t
446 compute_bias_add(float input1_scale, float input2_scale, uint8_t input1_zp, uint8_t input2_zp, float weight_scale)
447 {
448    int zero_point_diff = input2_zp - input1_zp;
449    double bias = zero_point_diff * input1_scale;
450    bias /= weight_scale * input2_scale;
451 
452    double addition_offset = input1_scale / input2_scale;
453    addition_offset /= weight_scale;
454    addition_offset = round(addition_offset + 0.0) * 1;
455 
456    return (int) (round(bias) - round(addition_offset) * input2_zp);
457 }
458 
459 void
460 etna_ml_lower_add(struct etna_ml_subgraph *subgraph,
461                   const struct pipe_ml_operation *poperation,
462                   struct etna_operation *operation)
463 {
464    struct pipe_context *context = subgraph->base.context;
465 
466    assert(poperation->type == PIPE_ML_OPERATION_TYPE_ADD);
467 
468    operation->addition = true;
469    operation->depthwise = false;
470    operation->pointwise = false;
471    operation->pooling_first_pixel = false;
472    operation->padding_same = false;
473    operation->stride = 1;
474 
475    operation->input_tensor = poperation->input_tensor->index;
476    operation->add_input_tensor = poperation->add.input_tensor->index;
477    operation->input_width = poperation->input_tensor->dims[1];
478    operation->input_height = poperation->input_tensor->dims[2];
479    operation->input_channels = poperation->input_tensor->dims[3];
480    operation->input_zero_point = poperation->input_tensor->zero_point;
481    operation->input_scale = poperation->input_tensor->scale;
482    operation->input_tensor_size = operation->input_width *
483                                   operation->input_height *
484                                   operation->input_channels *
485                                   2;
486 
487    operation->output_tensor = poperation->output_tensor->index;
488    operation->output_width = poperation->output_tensor->dims[1];
489    operation->output_height = poperation->output_tensor->dims[2];
490    operation->output_channels = poperation->output_tensor->dims[3];
491    operation->output_zero_point = poperation->output_tensor->zero_point;
492    operation->output_scale = poperation->output_tensor->scale;
493 
494    operation->weight_tensor = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, 8);
495    operation->weight_width = 2;
496    operation->weight_height = 2;
497    operation->weight_zero_point = 0x0;
498    operation->weight_scale = compute_weight_scale_add(poperation->add.input_tensor->scale, poperation->input_tensor->scale);
499    operation->addition_offset = compute_addition_offset(poperation->add.input_tensor->scale, poperation->input_tensor->scale, operation->weight_scale);
500 
501    uint8_t *weight_map = map_resource(operation->weight_tensor);
502    memset(weight_map, 0, pipe_buffer_size(operation->weight_tensor));
503    weight_map[0] = compute_weight_add(poperation->add.input_tensor->scale, poperation->input_tensor->scale, operation->weight_scale);
504 
505    operation->bias_tensor = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, 4);
506    int32_t *bias_map = map_resource(operation->bias_tensor);
507    bias_map[0] = compute_bias_add(poperation->add.input_tensor->scale, poperation->input_tensor->scale,
508                                   poperation->add.input_tensor->zero_point, poperation->input_tensor->zero_point,
509                                   operation->weight_scale);
510 }
511 
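/* These look like per-core hardware buffer sizes driving the tiling
 * heuristics below: the accumulation buffer depth and input buffer depth in
 * rows, and the maximum output tile width in pixels.
 */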
512 #define ACCUM_BUFFER_DEPTH 64
513 #define INPUT_BUFFER_DEPTH 12
514 #define MAX_TILE_WIDTH 64
515 
516 static unsigned
517 calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode)
518 {
519    unsigned nn_core_count = ctx->screen->specs.nn_core_count;
520    unsigned kernels_per_core = DIV_ROUND_UP(operation->output_channels, nn_core_count);
521    unsigned foo = (ACCUM_BUFFER_DEPTH * interleave_mode) / tile_y;
522 
523    if (operation->weight_width == 1)
524       foo = MIN2(foo, ACCUM_BUFFER_DEPTH / 3);
525 
526    foo = MIN2(foo, kernels_per_core);
527    foo = MIN2(foo, 127);
528 
529    kernels_per_core = DIV_ROUND_UP(operation->output_channels, nn_core_count * foo);
530    unsigned num_kernels = DIV_ROUND_UP(operation->output_channels, kernels_per_core * nn_core_count);
531    unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, nn_core_count), num_kernels);
532 
533    /* TODO: Remove this once we support superblocks that don't divide output_channels in the compressed buffer */
534    while(operation->output_channels % superblocks)
535       superblocks++;
536 
537    ML_DBG("superblocks %d\n", superblocks);
538 
539    return superblocks;
540 }
541 
542 static unsigned
543 calc_interleave_mode(unsigned tile_width, unsigned weight_height)
544 {
545    unsigned mode = 8;
546 
547    if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2)
548       return 1;
549 
550    if (tile_width > MAX_TILE_WIDTH / 2)
551       mode = 1;
552    else if (tile_width > MAX_TILE_WIDTH / 4)
553       mode = 2;
554    else if (tile_width > MAX_TILE_WIDTH / 8)
555       mode = 4;
556 
557    if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4)
558       return MIN2(mode, 4);
559 
560    return MIN2(mode, 2);
561 }
562 
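/* For ADD operations the two operands are treated as a single two-channel
 * image and the result as a one-channel image; this helper picks an image
 * width that evenly divides the flattened tensor (preferring 128/64/32,
 * presumably for efficiency) so the buffers can be reinterpreted without
 * padding.
 */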
563 static void
564 calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *input_channels,
565                     unsigned *output_width, unsigned *output_height, unsigned *output_channels)
566 {
567    ML_DBG("addition input width %d channels %d\n", *input_width, *input_channels);
568 
569    unsigned channel_size = *input_width * *input_height;
570    unsigned width = 0;
571    if (channel_size % 128 == 0)
572       width = 128;
573    else if (channel_size % 64 == 0)
574       width = 64;
575    else if (channel_size % 32 == 0)
576       width = 32;
577    else {
578       for (int i = 63; i > 0; i--) {
579          if (channel_size % i == 0) {
580             width = i;
581             break;
582          }
583       }
584    }
585 
586    *input_height = (*input_width * *input_height * *input_channels) / width;
587    *input_width = width;
588    *input_channels = 2;
589 
590    *output_height = *output_width * *output_height * *output_channels / width;
591    *output_width = width;
592    *output_channels = 1;
593 }
594 
595 static unsigned
596 calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
597 {
598    unsigned input_width = operation->input_width;
599    unsigned input_height = operation->input_height;
600    unsigned input_channels = operation->input_channels;
601    unsigned output_width = operation->output_width;
602    unsigned output_height = operation->output_height;
603    unsigned output_channels = operation->output_channels;
604    unsigned tile_width;
605    unsigned tile_height;
606    unsigned superblocks;
607    unsigned interleave_mode;
608 
609    if (operation->addition)
610       calc_addition_sizes(&input_width, &input_height, &input_channels,
611                           &output_width, &output_height, &output_channels);
612 
613    if (operation->pooling_first_pixel) {
614       output_width *= 2;
615       output_height *= 2;
616    }
617 
618    tile_width = MIN2(output_width, 64);
619    interleave_mode = calc_interleave_mode(tile_width, operation->weight_height);
620 
621    tile_height = INPUT_BUFFER_DEPTH * interleave_mode - operation->weight_height + 1;
622    ML_DBG("INPUT_BUFFER_DEPTH %d interleave_mode %d operation->weight_height %d tile_height %d input_width %d output_width %d\n", INPUT_BUFFER_DEPTH, interleave_mode, operation->weight_height, tile_height, operation->input_width, output_width);
623    tile_height = MIN2(tile_height, interleave_mode * ACCUM_BUFFER_DEPTH);
624    //tile_height = MIN2(tile_height, operation->input_width);
625    tile_height = MIN2(tile_height, output_height);
626 
627    if (operation->stride > 1 && tile_height % 2 > 0)
628       tile_height -= 1;
629 
630    superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode);
631    ML_DBG("tiling x %d y %d sb %d\n", tile_width, tile_height, superblocks);
632 
633    if (tile_width_out)
634       *tile_width_out = tile_width;
635 
636    if (tile_height_out)
637       *tile_height_out = tile_height;
638 
639    return superblocks;
640 }
641 
642 static struct etna_bo *
643 create_nn_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct etna_bo *coefficients, unsigned coefficients_size)
644 {
645    struct pipe_context *context = subgraph->base.context;
646    struct etna_context *ctx = etna_context(context);
647    unsigned nn_core_count = ctx->screen->specs.nn_core_count;
648    unsigned oc_sram_size = ctx->screen->specs.on_chip_sram_size;
649    struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
650                                     sizeof(struct etna_nn_params),
651                                     DRM_ETNA_GEM_CACHE_WC);
652    unsigned input_width = operation->input_width;
653    unsigned input_height = operation->input_height;
654    unsigned input_channels = operation->input_channels;
655    unsigned output_width = operation->output_width;
656    unsigned output_height = operation->output_height;
657    unsigned output_channels = operation->output_channels;
658    unsigned weight_width = operation->weight_width;
659    unsigned weight_height = operation->weight_height;
660 
661    if (operation->pointwise && input_channels == 1)
662       weight_width = weight_height = 2;
663 
664    if (operation->addition)
665       calc_addition_sizes(&input_width, &input_height, &input_channels,
666                           &output_width, &output_height, &output_channels);
667 
668    unsigned input_size = input_width * input_height * input_channels;
669 
670    etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);
671 
672    struct etna_nn_params *map = etna_bo_map(bo);
673    map->layer_type = 0x0;
674    map->no_z_offset = 0x0;
675    map->prelu = 0x0;
676    map->nn_layer_flush = 0x1;
677    map->brick_mode = 0x0;
678    map->brick_distance = 0x0;
679    map->relu = 0x0;
680    map->no_flush = 0x0;
681    map->rounding_mode = 0x1;
682    map->partial_cache_data_unit = 0x0;
683    map->depthwise = 0x0;
684 
685    map->unused0 = 0x0;
686    map->unused1 = 0x0;
687    map->unused2 = 0x0;
688    map->unused3 = 0x0;
689    map->unused4 = 0x0;
690    map->unused5 = 0x0;
691    map->unused6 = 0x0;
692    map->unused7 = 0x0;
693    map->unused8 = 0x0;
694    map->unused9 = 0x0;
695    map->unused10 = 0x0;
696    map->unused11 = 0x0;
697    map->unused12 = 0x0;
698    map->unused13 = 0x0;
699    map->unused14 = 0x0;
700    map->further1 = 0x0;
701    map->further2 = 0x0;
702    map->further3 = 0x3ffffff;
703    map->further4 = 0x7f800000;
704    map->further5 = 0xff800000;
705    map->further6 = 0x0;
706    map->further7 = 0x0;
707    map->further8 = 0x0;
708 
709    struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
710    unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensor);
711    map->in_image_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset;
712    map->in_image_x_size = input_width;
713    map->in_image_y_size = input_height;
714    map->in_image_x_stride = input_width;
715    map->in_image_y_stride = input_height;
716    map->in_image_data_type = ETNA_NN_INT8;
717    map->in_image_data_type_bit_2 = ETNA_NN_INT8 >> 2;
718    map->in_image_circular_buf_size = 0x0;
719    map->in_image_circular_buf_end_addr_plus_1 = 0xFFFFFFFF >> 6;
720    map->in_image_border_mode = 0x0;
721    map->in_image_border_const = operation->input_zero_point;
722 
723    if (operation->padding_same && operation->stride == 1 && weight_width > 2) {
724       if (weight_width < 5) {
725          map->in_image_x_offset = 0x7;
726          map->in_image_y_offset = 0x7;
727       } else {
728          map->in_image_x_offset = 0x6;
729          map->in_image_y_offset = 0x6;
730       }
731       map->in_image_x_offset_bit_3 = 0x1;
732       map->in_image_y_offset_bit_3 = 0x1;
733    } else {
734       map->in_image_x_offset = 0x0;
735       map->in_image_y_offset = 0x0;
736       map->in_image_x_offset_bit_3 = 0x0;
737       map->in_image_y_offset_bit_3 = 0x0;
738    }
739 
740    if (operation->padding_same && operation->stride == 2 && weight_width == 5) {
741       map->in_image_x_offset = 0x7;
742       map->in_image_y_offset = 0x7;
743       map->in_image_x_offset_bit_3 = 0x1;
744       map->in_image_y_offset_bit_3 = 0x1;
745    }
746 
747    struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
748    offset = etna_ml_get_offset(subgraph, operation->output_tensor);
749    map->out_image_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;
750    map->out_image_x_size = output_width;
751    map->out_image_y_size = output_height;
752    map->out_image_z_size = output_channels;
753 
754    map->out_image_x_stride = map->out_image_x_size;
755    map->out_image_y_stride = map->out_image_y_size;
756 
757    map->out_image_data_type = ETNA_NN_INT8;
758    map->out_image_data_type_bit_2 = ETNA_NN_INT8 >> 2;
759    map->out_image_circular_buf_size = 0x0;
760    map->out_image_circular_buf_end_addr_plus_1 = 0xFFFFFFFF >> 6;
761    map->out_zero_point = operation->output_zero_point;
762 
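   /* For strided depthwise/pointwise convolutions the stride isn't lowered
    * via strided_to_normal(); instead the convolution runs with stride 1
    * over a doubled output and the pooling unit keeps only the first pixel
    * of each 2x2 block, which appears to be equivalent to the stride-2
    * result (see pooling_first_pixel in etna_ml_lower_convolution()).
    */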
763    if (operation->pooling_first_pixel) {
764       map->pooling = ETNA_NN_POOLING_FIRST_PIXEL;
765       map->pooling_xy_size = 0x0;
766 
767       map->out_image_x_size *= 2;
768       map->out_image_y_size *= 2;
769    } else {
770       map->pooling = ETNA_NN_POOLING_NON;
771       map->pooling_xy_size = 0x1;
772    }
773 
774    unsigned tile_x, tile_y;
775    unsigned superblocks = calculate_tiling(ctx, operation, &tile_x, &tile_y);
776    map->out_image_tile_x_size = tile_x;
777    map->out_image_tile_y_size = tile_y;
778 
779    map->kernel_address = etna_bo_gpu_va(coefficients) >> 6;
780    map->kernel_xy_size = weight_width;
781    map->kernel_y_size = weight_height;
782    map->kernel_z_size = input_channels;
783    map->kernel_z_size2 = 0x0;
784    map->kernel_data_type = ETNA_NN_INT8;
785    map->kernel_data_type_bit_2 = ETNA_NN_INT8 >> 2;
786    map->kernel_direct_stream_from_VIP_sram = 0x0;
787 
788    map->coef_zero_point = operation->weight_zero_point;
789 
790    map->kernels_per_core = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), superblocks);
791 
792    /* Should be max accumBufferDepth (64) / zdpNum (3) */
793    //assert(map->kernels_per_core <= (64 / 3));
794 
795    /* The header doesn't get cached */
796    coefficients_size -= 64;
797 
798    map->kernel_cache_start_address = 0x800;
799    map->kernel_cache_end_address = MAX2(MIN2(map->kernel_cache_start_address + coefficients_size, oc_sram_size), 0x1a00);
800 
801    if (output_channels <= 128 || map->kernel_cache_end_address == oc_sram_size) {
802       map->image_caching_mode = SRAM_CACHE_MODE_NO_CACHE;
803       map->image_cache_start_address = 0x0;
804       map->image_cache_end_address = 0x800;
805    } else {
806       map->image_caching_mode = SRAM_CACHE_MODE_FULL_CACHE;
807       map->image_cache_start_address = map->kernel_cache_end_address;
808       map->image_cache_end_address = MIN2(map->image_cache_start_address + input_size + 1024, oc_sram_size);
809    }
810 
811    /* TODO: Look at re-enabling the image cache again */
812    map->image_caching_mode = SRAM_CACHE_MODE_NO_CACHE;
813    map->image_cache_start_address = 0x0;
814    map->image_cache_end_address = 0x800;
815 
816    if (etna_bo_size(coefficients) <= 0x80000 - 0x800) {
817       map->kernel_caching_mode = SRAM_CACHE_MODE_FULL_CACHE;
818       map->kernel_pattern_msb = 0x0;
819       map->kernel_pattern_low = 0x0;
820       map->kernel_pattern_high = 0x0;
821    } else {
822       /* Doesn't fit in the 512KB we have of on-chip SRAM */
823       map->kernel_caching_mode = SRAM_CACHE_MODE_PARTIAL_CACHE;
824       if (map->out_image_z_size >= 1024) {
825          map->kernel_pattern_msb = 0x13;
826          map->kernel_pattern_low = 0x80000;
827          map->kernel_pattern_high = 0x0;
828       } else if (map->out_image_z_size >= 512) {
829          map->kernel_pattern_msb = 0x3d;
830          map->kernel_pattern_low = 0x0;
831          map->kernel_pattern_high = 0x2aaaaaa0;
832       } else if (map->out_image_z_size >= 256) {
833          map->kernel_pattern_msb = 0x3e;
834          map->kernel_pattern_low = 0xffffaaaa;
835          map->kernel_pattern_high = 0x7fffffff;
836       } else if (map->out_image_z_size >= 160) {
837          map->kernel_pattern_msb = 0x6;
838          map->kernel_pattern_low = 0x7e;
839          map->kernel_pattern_high = 0x0;
840       } else {
841          map->kernel_pattern_msb = 0x3f;
842          map->kernel_pattern_low = 0xfffffffe;
843          map->kernel_pattern_high = 0xffffffff;
844       }
845    }
846 
847    float conv_scale = (operation->input_scale * operation->weight_scale) / operation->output_scale;
848    uint32_t scale_bits = fui(conv_scale);
849    /* Taken from https://github.com/pytorch/QNNPACK/blob/master/src/qnnpack/requantization.h#L130 */
850    unsigned shift = 127 + 31 - 32 - (scale_bits >> 23) + 16;
851 
852    /* Divides by 2^(post_shift - 18), rounding to nearest integer. If the result doesn't fit in 8 bits, it is clamped to 255. galcore sets this to 15 if INT8, to 0 if UINT8. */
853    map->post_shift = shift & 0x1f;
854    map->post_shift_bit_5_6 = (shift >> 5) & 0x3;
855 
856    /* Multiplies by (multiplier * 2^15) */
857    map->post_multiplier = (scale_bits >> 8) & 0x1;
858    map->post_multiplier_1_to_6 = (scale_bits >> 9) & 0x3f;
859    map->post_multiplier_7_to_14 = (scale_bits >> 15) & 0xff;
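   /* Taken together this looks like standard fixed-point requantization:
    * the 15 post_multiplier bits are bits 8..22 of the IEEE-754 encoding of
    * conv_scale and post_shift is derived from its exponent, so that
    * result ~= (accumulator * multiplier) >> post_shift ~= accumulator * conv_scale.
    */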
860 
861    map->per_channel_post_mul = 0x0;
862 
863    etna_bo_cpu_fini(bo);
864 
865    return bo;
866 }
867 
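/* In quantized arithmetic sum((w - w_zp) * (x - x_zp)) expands to
 * sum((w - w_zp) * x) - x_zp * sum(w - w_zp); the second term is constant per
 * output channel, so it is computed here and folded into the bias that gets
 * written alongside the weights.
 */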
868 static uint32_t calculate_bias_correction(uint8_t *weights, const struct etna_operation *operation)
869 {
870    int32_t correction = 0;
871 
872    for (unsigned i = 0; i < operation->weight_width * operation->weight_height * operation->input_channels; i++) {
873       correction += (weights[i] - operation->weight_zero_point) * operation->input_zero_point;
874    }
875 
876    return correction;
877 }
878 
879 static void
880 write_6_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const struct etna_operation *operation)
881 {
882    struct pipe_context *pctx = subgraph->base.context;
883    unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count;
884    unsigned cores_used = MIN2(operation->output_channels, nn_core_count);
885    uint8_t *input = map_resource(operation->weight_tensor);
886    uint32_t *biases = map_resource(operation->bias_tensor);
887    unsigned out_values_per_channel = operation->output_width * operation->output_height;
888    unsigned stride = MIN2(operation->input_channels, 6);
889    unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
890    uint8_t *weights_maps[DIV_ROUND_UP(kernels_per_core, superblocks)];
891 
892    ML_DBG("%s\n", __func__);
893 
894    for (unsigned superblock = 0; superblock < superblocks; superblock++) {
895 
896       unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
897       if (superblock == superblocks - 1)
898          kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks;
899 
900       for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
901          unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used;
902          weights_maps[kernel] = input + out_channel * operation->weight_width * operation->weight_height * operation->input_channels;
903       }
904 
905       for (unsigned block = 0; block < DIV_ROUND_UP(operation->input_channels, stride); block++) {
906          for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
907             unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used;
908 
909             if (block == 0) {
910                *map++ = weights_maps[kernel][0];
911 
912                uint32_t corr = calculate_bias_correction(weights_maps[kernel], operation);
913                //fprintf(stderr, "core %d sb %d b %d kernel %d out_channel %d bias %x first %02x\n", core, superblock, block, kernel, out_channel, biases[out_channel] - corr, weights_maps[kernel][0]);
914                *((uint32_t *)map) = biases[out_channel] - corr;
915                map += sizeof(uint32_t);
916 
917                for (int i = 1; i < stride; i++) {
918                   *map++ = weights_maps[kernel][i];
919                }
920             } else {
921                for (int i = 0; i < stride; i++) {
922                   if (i + block * stride < operation->input_channels)
923                      *map++ = weights_maps[kernel][i + block * stride];
924                }
925             }
926             if (block == DIV_ROUND_UP(operation->input_channels, stride) - 1) {
927                *((uint32_t*)map) = out_values_per_channel * out_channel;
928                map += sizeof(uint32_t);
929             }
930          }
931       }
932    }
933 }
934 
935 static void
936 write_interleaved_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const struct etna_operation *operation)
937 {
938    struct pipe_context *pctx = subgraph->base.context;
939    unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count;
940    unsigned cores_used = MIN2(operation->output_channels, nn_core_count);
941    uint8_t *input = map_resource(operation->weight_tensor);
942    uint32_t *biases = map_resource(operation->bias_tensor);
943    unsigned out_values_per_channel = operation->output_width * operation->output_height;
944    unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
945    uint8_t (*weights_map)[operation->input_channels][operation->weight_width][operation->weight_height] = (void *)input;
946 
947    ML_DBG("%s core %d\n", __func__, core);
948 
949    for (unsigned superblock = 0; superblock < superblocks; superblock++) {
950 
951       unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
952       if (superblock == superblocks - 1)
953          kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks;
954 
955       for (unsigned z = 0; z < operation->input_channels; z++) {
956          for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
957             unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used;
958 
959 #if 0
960             if (z == 0)
961                fprintf(stderr, "core %d DIV_ROUND_UP(kernels_per_core, superblocks) %d kernel %d superblock * (operation->output_channels / superblocks) %u out_channel %d\n",
962                        core, DIV_ROUND_UP(kernels_per_core, superblocks), kernel, superblock * (operation->output_channels / superblocks + 4), out_channel);
963 #endif
964 
965             for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) {
966                unsigned stride = operation->weight_height;
967                if (operation->weight_height > 3)
968                   stride = 3;
969                for (unsigned x = block * 2; x < (block + 1) * 2; x++ ) {
970                   if (x >= operation->weight_width)
971                      break;
972                   for (unsigned y = 0; y < stride; y++) {
973                      //fprintf(stderr, "oc %d x %d y %d z %d: %02x\n", out_channel, x, y, z, weights_map[out_channel][z][x][y]);
974                      *map++ = weights_map[out_channel][z][x][y];
975                      if (x == 0 && y == 0 && z == 0) {
976                         uint32_t corr = calculate_bias_correction((uint8_t *)weights_map[out_channel], operation);
977                         //fprintf(stderr, "core %d sb %d ic %d out_channel %d kernel %d bias %x first %02x\n", core, superblock, z, out_channel, kernel, biases[out_channel] - corr, weights_map[out_channel][z][x][y]);
978                         *((uint32_t *)map) = biases[out_channel] - corr;
979                         map += sizeof(uint32_t);
980                      }
981                   }
982                }
983                if (operation->weight_height > 3) {
984                   for (unsigned x = block * 2; x < (block + 1) * 2; x++ ) {
985                      if (x >= operation->weight_width)
986                         break;
987                      for (unsigned y = stride; y < operation->weight_width; y++) {
988                         //fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[out_channel][z][x][y]);
989                         *map++ = weights_map[out_channel][z][x][y];
990                      }
991                   }
992                }
993             }
994 
995             if (z == operation->input_channels - 1) {
996                *((uint32_t*)map) = out_values_per_channel * out_channel;
997                map += sizeof(uint32_t);
998             }
999          }
1000       }
1001    }
1002 }
1003 
1004 static void
1005 write_sequential_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const struct etna_operation *operation)
1006 {
1007    struct pipe_context *pctx = subgraph->base.context;
1008    unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count;
1009    unsigned cores_used = MIN2(operation->output_channels, nn_core_count);
1010    uint8_t *input = map_resource(operation->weight_tensor);
1011    uint32_t *biases = map_resource(operation->bias_tensor);
1012    unsigned out_values_per_channel = operation->output_width * operation->output_height;
1013    unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
1014 
1015    ML_DBG("%s: superblocks %d channels %d\n", __func__, superblocks, operation->output_channels);
1016 
1017    for (unsigned superblock = 0; superblock < superblocks; superblock++) {
1018 
1019       unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
1020       if (superblock == superblocks - 1)
1021          kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks;
1022 
1023       for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
1024          unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used;
1025 
1026          uint8_t (*weights_map)[operation->weight_height] = (void*) input + out_channel * operation->weight_width * operation->weight_height;
1027 
1028          for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) {
1029             unsigned stride = operation->weight_height;
1030             if ((operation->depthwise || operation->input_width > 64) && \
1031                operation->weight_height > 3)
1032                stride = 3;
1033             for (unsigned x = block * 2; x < (block + 1) * 2; x++ ) {
1034                if (x >= operation->weight_width)
1035                   break;
1036                for (unsigned y = 0; y < stride; y++) {
1037                   //fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[x][y]);
1038 
1039                   *map++ = weights_map[x][y];
1040                   if (x == 0 && y == 0) {
1041                      uint32_t corr = calculate_bias_correction((uint8_t *)weights_map, operation);
1042                      *((uint32_t *)map) = biases[out_channel] - corr;
1043                      map += sizeof(uint32_t);
1044                   }
1045                }
1046             }
1047             if ((operation->depthwise || operation->input_width > 64) && \
1048                operation->weight_height > 3) {
1049                for (unsigned x = block * 2; x < (block + 1) * 2; x++ ) {
1050                   if (x >= operation->weight_width)
1051                      break;
1052                   for (unsigned y = stride; y < operation->weight_width; y++) {
1053                      //fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[x][y]);
1054                      *map++ = weights_map[x][y];
1055                   }
1056                }
1057             }
1058          }
1059          if (operation->addition) {
1060             *((uint32_t*)map) = operation->addition_offset;
1061          } else
1062             *((uint32_t*)map) = out_values_per_channel * out_channel;
1063          map += sizeof(uint32_t);
1064       }
1065    }
1066 }
1067 
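/* Layout of the coefficients buffer, as far as can be inferred from the code
 * below: a 64-byte-aligned header holding one 32-bit stream size per core,
 * then one stream per core made of a 1-byte zero-length-encoding flag, a
 * 16-bit kernel count and the weights in one of the three formats above,
 * with each kernel's 32-bit bias (zero-point corrected) inserted after its
 * first weight byte and a 32-bit output offset appended after its last one.
 */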
1068 static struct etna_bo *
1069 create_coefficients_bo(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *size)
1070 {
1071    /* TODO: Implement zero-length encoding of weights and biases for bandwidth savings */
1072    struct pipe_context *context = subgraph->base.context;
1073    struct etna_context *ctx = etna_context(context);
1074    unsigned nn_core_count = ctx->screen->specs.nn_core_count;
1075    unsigned header_size = ALIGN(nn_core_count * 4, 64);
1076    unsigned weight_item_size = 1; /* TODO: Support types other than (u)int8 */
1077    unsigned input_channels;
1078    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
1079    unsigned cores_used = MIN2(output_channels, nn_core_count);
1080    unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
1081    uint8_t zero_length_encoding = false;
1082    unsigned weights_size;
1083    unsigned core_size;
1084    unsigned core_size_aligned;
1085 
1086    input_channels = operation->addition ? 1 : operation->input_channels;
1087    weights_size = operation->weight_width * operation->weight_height * input_channels * weight_item_size;
1088    core_size = 3 + (weights_size + 4 + 4) * kernels_per_core;
1089    core_size_aligned = ALIGN(core_size, 64);
1090    *size = header_size + core_size_aligned * cores_used;
1091 
1092    struct etna_bo *compressed = etna_bo_new(ctx->screen->dev,
1093                                             *size,
1094                                             DRM_ETNA_GEM_CACHE_WC);
1095 
1096    etna_bo_cpu_prep(compressed, DRM_ETNA_PREP_WRITE);
1097 
1098    uint8_t *map = etna_bo_map(compressed);
1099    uint32_t *header = (uint32_t *)map;
1100 
1101    memset(map, 0, *size);
1102 
1103    for (unsigned core = 0; core < cores_used; core++)
1104       header[core] = core_size_aligned;
1105 
1106    map += header_size;
1107 
1108 #if 0
1109    uint8_t *input = map_resource(operation->weight_tensor);
1110    for (int i = 0; i < operation->output_channels * operation->input_channels * operation->weight_width * operation->weight_height; i++)
1111       fprintf(stderr, "i %d: %02x\n", i, input[i]);
1112 #endif
1113 
1114    for (unsigned core = 0; core < cores_used; core++) {
1115 
1116       *map++ = zero_length_encoding;
1117 
1118       *((uint16_t *)map) = kernels_per_core;
1119       map += sizeof(uint16_t);
1120 
1121       if (operation->pointwise && input_channels >= 1 && output_channels > 8)
1122          write_6_weight_format(subgraph, map, kernels_per_core, core, operation);
1123       else if (input_channels > 1)
1124          write_interleaved_weight_format(subgraph, map, kernels_per_core, core, operation);
1125       else
1126          write_sequential_weight_format(subgraph, map, kernels_per_core, core, operation);
1127 
1128       map += core_size_aligned - 3;
1129    }
1130 
1131    etna_bo_cpu_fini(compressed);
1132 
1133    return compressed;
1134 }
1135 
1136 void
1137 etna_ml_compile_operation_nn(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
1138                              struct etna_vip_instruction *instruction)
1139 {
1140    unsigned coefficients_size;
1141 
1142    instruction->type = ETNA_JOB_TYPE_NN;
1143    instruction->coefficients = create_coefficients_bo(subgraph, operation, &coefficients_size);
1144 
1145    struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
1146    assert(input);
1147    pipe_resource_reference(&instruction->input, input);
1148 
1149    struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
1150    assert(output);
1151    pipe_resource_reference(&instruction->output, output);
1152 
1153    instruction->configs[0] = create_nn_config(subgraph, operation, instruction->coefficients, coefficients_size);
1154 }
1155 
1156 void
1157 etna_ml_emit_operation_nn(struct etna_ml_subgraph *subgraph,
1158                           struct etna_vip_instruction *operation,
1159                           unsigned idx)
1160 {
1161    struct pipe_context *pctx = subgraph->base.context;
1162    struct etna_context *ctx = etna_context(pctx);
1163    struct etna_cmd_stream *stream = ctx->stream;
1164    unsigned offset = idx + 1;
1165    unsigned nn_config = VIVS_GL_NN_CONFIG_NN_CORE_COUNT(0x0); /* This disables power control of NN cores and enables all of them */
1166 
1167    if (DBG_ENABLED(ETNA_DBG_NPU_NO_PARALLEL)) {
1168       nn_config |= VIVS_GL_NN_CONFIG_SMALL_BATCH;
1169       offset = 0;
1170    }
1171 
1172    etna_set_state(stream, VIVS_GL_OCB_REMAP_START, 0x0);
1173    etna_set_state(stream, VIVS_GL_OCB_REMAP_END, 0x0);
1174 
1175    etna_set_state(stream, VIVS_GL_NN_CONFIG, nn_config);
1176    etna_set_state_reloc(stream, VIVS_PS_NN_INST_ADDR, &(struct etna_reloc) {
1177       .bo = operation->configs[0],
1178       .flags = ETNA_RELOC_READ,
1179       .offset = offset,
1180    });
1181    etna_set_state(stream, VIVS_PS_UNK10A4, offset);
1182 }
1183