• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "pipe/p_state.h"
7 #include "util/u_inlines.h"
8 
9 #include "etnaviv_context.h"
10 #include "etnaviv_debug.h"
11 #include "etnaviv_emit.h"
12 #include "etnaviv_ml.h"
13 #include "etnaviv_ml_nn.h"
14 
15 #define ETNA_NN_INT8 0
16 
17 #define SRAM_CACHE_MODE_NO_CACHE 0x0
18 #define SRAM_CACHE_MODE_FULL_CACHE 0x1
19 #define SRAM_CACHE_MODE_PARTIAL_CACHE 0x2
20 
/* Pooling modes as encoded in the NN descriptor's "pooling" field. */
enum pooling_type {
    ETNA_NN_POOLING_NON,
    ETNA_NN_POOLING_MAX,
    ETNA_NN_POOLING_AVG,
    ETNA_NN_POOLING_FIRST_PIXEL
};
27 
/* Helper to declare one hardware bitfield; keeps the layout table compact. */
#define FIELD(field, bits) uint32_t field : bits;

/*
 * Hardware layout of an NN layer descriptor. Each numbered comment marks
 * the start of a 32-bit word within the descriptor. Comments such as
 * ">> 6" note the shift applied to a value before it is stored in the
 * field, and "& 0x..." the mask taken from the source value.
 */
struct etna_nn_params {

   FIELD(layer_type, 1) /* conv: 0 fully_connected: 1 */
   FIELD(no_z_offset, 1)
   FIELD(kernel_xy_size, 4)
   FIELD(kernel_z_size, 14) /* & 0x3FFF */
   FIELD(kernels_per_core, 7)
   FIELD(pooling, 2)
   FIELD(pooling_xy_size, 1)
   FIELD(prelu, 1)
   FIELD(nn_layer_flush, 1)

   /* 1 */
   FIELD(kernel_data_type, 2) /* UINT8 0x2 INT8 0x0 */
   FIELD(in_image_data_type, 2) /* UINT8 0x2 INT8 0x0 */
   FIELD(out_image_data_type, 2) /* UINT8 0x2 INT8 0x0 */
   FIELD(in_image_x_size, 13)
   FIELD(in_image_y_size, 13)

   /* 2 */
   FIELD(in_image_x_offset, 3)
   FIELD(in_image_y_offset, 3)
   FIELD(unused0, 1)
   FIELD(brick_mode, 1)
   FIELD(brick_distance, 16)
   FIELD(relu, 1)
   FIELD(unused1, 1)
   FIELD(post_multiplier, 1)
   FIELD(post_shift, 5)

   /* 3 */
   FIELD(unused2, 3)
   FIELD(no_flush, 1)
   FIELD(unused3, 2)
   FIELD(out_image_x_size, 13)
   FIELD(out_image_y_size, 13)

   /* 4 */
   /* Changes based on gcFEATURE_VALUE_NN_INIMAGE_OFFSET_BITS == 4 */
   FIELD(out_image_z_size, 14)
   FIELD(rounding_mode, 2)
   FIELD(in_image_x_offset_bit_3, 1) /*  >> 3 & 0x1 */
   FIELD(in_image_y_offset_bit_3, 1) /*  >> 3 & 0x1 */
   FIELD(out_image_tile_x_size, 7)
   FIELD(out_image_tile_y_size, 7)

   /* 5 */
   FIELD(kernel_address, 26) /* >> 6 */
   FIELD(kernel_z_size2, 6) /* >> 14 & 0x3F */

   /* 6 */
   FIELD(in_image_address, 32)

   /* 7 */
   FIELD(out_image_address, 32)

   /* 8 */
   FIELD(image_caching_mode, 2)
   FIELD(kernel_caching_mode, 2)
   FIELD(partial_cache_data_unit, 2)
   FIELD(kernel_pattern_msb, 6)
   FIELD(kernel_y_size, 4)
   FIELD(out_image_y_stride, 16)

   /* 9 */
   FIELD(kernel_pattern_low, 32)

   /* 10 */
   FIELD(kernel_pattern_high, 32)

   /* 11 */
   FIELD(kernel_cache_start_address, 32)

   /* 12 */
   FIELD(kernel_cache_end_address, 32)

   /* 13 */
   FIELD(image_cache_start_address, 32)

   /* 14 */
   FIELD(image_cache_end_address, 32)

   /* 15 */
   FIELD(in_image_border_mode, 2)
   FIELD(in_image_border_const, 16)
   FIELD(unused4, 1)
   FIELD(kernel_data_type_bit_2, 1)
   FIELD(in_image_data_type_bit_2, 1)
   FIELD(out_image_data_type_bit_2, 1)
   FIELD(post_multiplier_1_to_6, 6)
   FIELD(post_shift_bit_5_6, 2)
   FIELD(unused5, 2)

   /* 16 */
   FIELD(in_image_x_stride, 16)
   FIELD(in_image_y_stride, 16)

   /* 17 */
   FIELD(out_image_x_stride, 16)
   FIELD(unused6, 8)
   FIELD(post_multiplier_7_to_14, 8)

   /* 18 */
   FIELD(out_image_circular_buf_size, 26) /* >> 6 */
   FIELD(per_channel_post_mul, 1)
   FIELD(unused7_0, 1)
   FIELD(unused7_1, 1)
   FIELD(unused7_2, 1)
   FIELD(unused7_3, 2)

   /* 19 */
   FIELD(out_image_circular_buf_end_addr_plus_1, 26) /* >> 6 */
   FIELD(unused8, 6)

   /* 20 */
   FIELD(in_image_circular_buf_size, 26) /* >> 6 */
   FIELD(unused9, 6)

   /* 21 */
   FIELD(in_image_circular_buf_end_addr_plus_1, 26) /* >> 6 */
   FIELD(unused10, 6)

   /* 22 */
   FIELD(coef_zero_point, 8)
   FIELD(out_zero_point, 8)
   FIELD(kernel_direct_stream_from_VIP_sram, 1)
   FIELD(depthwise, 1)
   FIELD(post_multiplier_15_to_22, 8)
   FIELD(unused11, 6)

   /* 23, from here they aren't set on  */
   FIELD(unused12, 32)

   /* 24 */
   FIELD(unused13, 4)
   FIELD(unused14, 28)  /* 0 >> 4 */

   /* 25 */
   FIELD(unused15, 4)
   FIELD(unused16, 28)  /* 0 >> 4 */

   /* 26 */
   FIELD(further1, 32)
   FIELD(further2, 32)
   FIELD(further3, 32)
   FIELD(further4, 32)
   FIELD(further5, 32)
   FIELD(further6, 32)
   FIELD(further7, 32)
   FIELD(further8, 32)
};
181 
/* CPU-map the BO backing a pipe_resource and return the mapping. */
static void *
map_resource(struct pipe_resource *resource)
{
   return etna_bo_map(etna_resource(resource)->bo);
}
187 
188 
189 static void
pointwise_to_2x2(struct etna_ml_subgraph * subgraph,struct etna_operation * operation)190 pointwise_to_2x2(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
191 {
192    /* Fill a Nx2x2xN tensor with zero_points */
193    struct pipe_context *context = subgraph->base.context;
194    uint8_t *input = map_resource(operation->weight_tensor);
195    unsigned new_size = operation->output_channels * 2 * 2 * operation->input_channels;
196    struct pipe_resource *output_res = etna_ml_create_resource(context, new_size);
197    uint8_t *output = map_resource(output_res);
198 
199    for (unsigned channel = 0; channel < operation->output_channels; channel++) {
200       uint8_t *map_in = input + channel * 1 * 1 * operation->input_channels;
201       uint8_t *map_out = output + channel * 2 * 2 * operation->input_channels;
202 
203       map_out[0] = map_in[0];
204       if (operation->weight_signed) {
205          map_out[1] = operation->weight_zero_point - 128;
206          map_out[2] = operation->weight_zero_point - 128;
207          map_out[3] = operation->weight_zero_point - 128;
208       } else {
209          map_out[1] = operation->weight_zero_point;
210          map_out[2] = operation->weight_zero_point;
211          map_out[3] = operation->weight_zero_point;
212       }
213    }
214 
215    pipe_resource_reference(&operation->weight_tensor, NULL);
216    operation->weight_tensor = output_res;
217 
218    operation->weight_width = operation->weight_height = 2;
219    operation->pointwise = false;
220 }
221 
/*
 * Lower a depthwise convolution to a regular convolution by expanding the
 * depthwise filter set into output_channels full filters, where every
 * cross-channel tap is the weight zero-point (a no-op coefficient).
 */
static void
expand_depthwise(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   uint8_t *input = map_resource(operation->weight_tensor);
   unsigned new_size = operation->output_channels * operation->weight_width * operation->weight_height * operation->input_channels;
   struct pipe_resource *output_res = etna_ml_create_resource(context, new_size);
   uint8_t *output = map_resource(output_res);

   /* Lower depthwise convolution to regular convolution, as the hardware doesn't support those */
   for (unsigned channel = 0; channel < operation->output_channels; channel++) {
      /* channel < output_channels, so in_channel is always 0 and in_depth
       * equals channel: each output channel reads the single depthwise
       * filter set and keeps only its own depth slice. */
      unsigned in_channel = channel / operation->output_channels;
      unsigned in_depth = channel % operation->output_channels;

      uint8_t *map_in = input + in_channel * operation->weight_width * operation->weight_height * operation->input_channels;
      uint8_t *map_out = output + channel * operation->weight_width * operation->weight_height * operation->input_channels;

      for (unsigned i = 0; i < operation->weight_width * operation->weight_height * operation->input_channels; i++) {
         if (i % operation->input_channels == in_depth)
            map_out[i] = map_in[i];
         else if (operation->weight_signed)
            /* Signed weights are stored biased by +128; remove the bias
             * for the neutral padding value. */
            map_out[i] = operation->weight_zero_point - 128;
         else
            map_out[i] = operation->weight_zero_point;
      }
   }

   pipe_resource_reference(&operation->weight_tensor, NULL);
   operation->weight_tensor = output_res;
}
252 
/*
 * Deinterleave depthwise weights for the hardware: the source stores the
 * per-channel coefficients interleaved (channel is the fastest-varying
 * index), the destination stores one contiguous weight_width *
 * weight_height plane per output channel.
 */
static void
reorder_for_hw_depthwise(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   uint8_t *input = map_resource(operation->weight_tensor);
   struct pipe_resource *output_res = etna_ml_create_resource(context, pipe_buffer_size(operation->weight_tensor));
   /* VLA-typed view of the destination: output[channel][spatial_index] */
   uint8_t (*output)[operation->weight_width * operation->weight_height] = (void *)map_resource(output_res);

   for (int i = 0; i < operation->weight_height * operation->weight_width * operation->output_channels; i++) {
      unsigned out_channel = i % operation->output_channels;

      output[out_channel][i / operation->output_channels] = input[i];
   }

   pipe_resource_reference(&operation->weight_tensor, NULL);
   operation->weight_tensor = output_res;
}
270 
/*
 * Reorder the weight tensor so the input-channel dimension moves from
 * innermost (O x W x H x I) to just after the output channel
 * (O x I x W x H), the order the NN cores consume.
 */
static void
transpose(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   unsigned nn_core_version = etna_context(context)->screen->specs.nn_core_version;
   void *map = map_resource(operation->weight_tensor);
   unsigned new_size;
   struct pipe_resource *output_res;
   uint8_t *output;
   unsigned output_channels = operation->output_channels;
   unsigned input_channels;

   /* On v8 cores depthwise weights carry a single input-channel slice. */
   if (nn_core_version == 8 && operation->depthwise)
      input_channels = 1;
   else
      input_channels = operation->input_channels;

   /* Additions are lowered to a 1-output-channel, 2-input-channel conv
    * (see etna_ml_lower_add / etna_ml_calc_addition_sizes). */
   if (operation->addition) {
      output_channels = 1;
      input_channels = 2;
   }

   new_size = operation->output_channels * operation->weight_width * \
                     operation->weight_height * input_channels;
   output_res = etna_ml_create_resource(context, new_size);
   output = map_resource(output_res);

   /* VLA-typed view of the source: input[oc][w][h][ic] */
   uint8_t (*input)[operation->weight_width][operation->weight_height][input_channels] = map;
   unsigned i = 0;
   for (unsigned d0 = 0; d0 < output_channels; d0++)
      for (unsigned d3 = 0; d3 < input_channels; d3++)
         for (unsigned d1 = 0; d1 < operation->weight_width; d1++)
            for (unsigned d2 = 0; d2 < operation->weight_height; d2++)
               ((uint8_t*)output)[i++] = input[d0][d1][d2][d3];

   pipe_resource_reference(&operation->weight_tensor, NULL);
   operation->weight_tensor = output_res;
}
309 
/*
 * Copy one depth slice of a 3D tensor into a 2D tile, taking every
 * `stride`-th sample starting at (offset_x, offset_y). Samples that fall
 * outside the source extents are filled with the zero-point `in_zp`.
 */
static void
subsample(uint8_t *map_in, unsigned in_width, unsigned in_height, unsigned in_depth, unsigned out_width, unsigned out_height, unsigned in_z, unsigned offset_x, unsigned offset_y, unsigned stride, uint8_t *map_out, int in_zp)
{
   /* VLA-typed views: src[x][y][z], dst[x][y]. */
   uint8_t (*src)[in_height][in_depth] = (uint8_t(*)[in_height][in_depth])map_in;
   uint8_t (*dst)[out_height] = (uint8_t(*)[out_height])map_out;

   for (unsigned col = 0; col < out_height; col++) {
      for (unsigned row = 0; row < out_width; row++) {
         unsigned sx = row * stride + offset_x;
         unsigned sy = col * stride + offset_y;

         dst[row][col] = (sx < in_width && sy < in_height) ? src[sx][sy][in_z]
                                                           : in_zp;
      }
   }
}
326 
/* TODO: Do the reshaping in the TP units, for big enough buffers */
/*
 * Space-to-depth rearrangement of a tensor, per output channel: every
 * stride x stride phase of each source depth slice becomes its own
 * destination slice, so the spatial extent shrinks while the depth grows
 * by stride^2. dims_in/dims_out are {out_channels, width, height, depth}.
 */
static void
reshape(uint8_t *input, uint8_t *output, unsigned stride, int in_zp, unsigned dims_in[4], unsigned dims_out[4])
{
   for (unsigned out_channel = 0; out_channel < dims_in[0]; out_channel++) {
      void *map_in = input + out_channel * dims_in[1] * dims_in[2] * dims_in[3];
      void *map_out = output + out_channel * dims_out[1] * dims_out[2] * dims_out[3];

      /* See Figure 3 in https://arxiv.org/abs/1712.02502 */
      /* This is only valid for stride == 2 */
      assert(stride == 2);
      uint8_t (*out)[dims_out[1]][dims_out[2]] = (uint8_t(*)[dims_out[1]][dims_out[2]])map_out;
      for (unsigned z = 0; z < dims_in[3]; z++) {
         /* The four (dx, dy) phases of source slice z land in destination
          * slices 4 * z + 0..3. */
         subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 0, 0, stride, (uint8_t *)out[0 + z * stride * stride], in_zp);
         subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 0, 1, stride, (uint8_t *)out[1 + z * stride * stride], in_zp);
         subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 1, 0, stride, (uint8_t *)out[2 + z * stride * stride], in_zp);
         subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 1, 1, stride, (uint8_t *)out[3 + z * stride * stride], in_zp);
      }
   }
}
347 
/*
 * Lower a stride-2 convolution to stride 1 by reshaping the input
 * (space to depth) and subsampling the weights to match. Updates the
 * operation's input/weight geometry in place and replaces the weight
 * tensor.
 */
static void
strided_to_normal(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   uint8_t *input = map_resource(operation->weight_tensor);
   unsigned new_size;
   struct pipe_resource *output_res;
   uint8_t *output;

   /* The hardware doesn't support strides natively, so we "lower" them as
      * described in this paper:
      *
      * "Take it in your stride: Do we need striding in CNNs?" https://arxiv.org/abs/1712.02502
      */

   /* TODO: Support more strides */
   assert(operation->stride == 2);

   /* Weight dimensions before lowering, needed by reshape() below. */
   unsigned wdims_in[4] = {operation->output_channels,
                           operation->weight_width,
                           operation->weight_height,
                           operation->input_channels};

   /* The lowered input trades spatial extent for depth: stride^2 times the
    * channels, width and height divided by the stride (rounded up). */
   operation->input_channels = operation->input_channels * operation->stride * operation->stride;
   operation->input_width = DIV_ROUND_UP(operation->input_width, operation->stride);
   operation->input_height = DIV_ROUND_UP(operation->input_height, operation->stride);

   /* NOTE(review): SAME padding adds an extra border — two pixels for 5x5
    * kernels, one otherwise; confirm against the lowering scheme. */
   if (operation->padding_same) {
      if (operation->weight_width == 5) {
         operation->input_width += 2;
         operation->input_height += 2;
      } else {
         operation->input_width += 1;
         operation->input_height += 1;
      }
   }

   operation->weight_width = DIV_ROUND_UP(operation->weight_width, operation->stride);
   operation->weight_height = DIV_ROUND_UP(operation->weight_height, operation->stride);

   new_size = operation->output_channels * operation->weight_width * operation->weight_height * operation->input_channels;
   output_res = etna_ml_create_resource(context, new_size);
   output = map_resource(output_res);

   unsigned wdims_out[4] = {operation->output_channels, operation->weight_width, operation->weight_height, operation->input_channels};
   /* Padding value: undo the +128 storage bias for signed weights. */
   int weight_zero_point = operation->weight_signed ? (operation->weight_zero_point - 128) : operation->weight_zero_point;
   reshape(input, output, operation->stride, weight_zero_point, wdims_in, wdims_out);

   pipe_resource_reference(&operation->weight_tensor, NULL);
   operation->weight_tensor = output_res;
}
399 
400 static bool
calc_pooling_first_pixel(struct etna_ml_subgraph * subgraph,const struct pipe_ml_operation * poperation)401 calc_pooling_first_pixel(struct etna_ml_subgraph *subgraph,
402                          const struct pipe_ml_operation *poperation)
403 {
404    struct pipe_context *context = subgraph->base.context;
405    unsigned nn_core_version = etna_context(context)->screen->specs.nn_core_version;
406    unsigned input_width = poperation->input_tensors[0]->dims[1];
407    unsigned input_channels = poperation->input_tensors[0]->dims[3];
408 
409    if (poperation->conv.stride_x == 1)
410       return false;
411 
412    if (poperation->conv.depthwise)
413       return true;
414 
415    if (nn_core_version < 8) {
416       if (poperation->conv.pointwise)
417          return true;
418    } else {
419       if (poperation->conv.pointwise && input_width >= 3 && input_channels > 1)
420          return true;
421 
422       if (poperation->conv.pointwise && poperation->conv.padding_same)
423          return true;
424    }
425 
426    return false;
427 }
428 
429 static inline uint8_t
etna_tensor_zero_point(struct pipe_tensor * tensor)430 etna_tensor_zero_point(struct pipe_tensor *tensor)
431 {
432    if (tensor->is_signed) {
433       /*
434        * Since the hardware only supports unsigned 8-bit integers, signed
435        * tensors are shifted from the -128..127 range to 0..255 by adding 128
436        * when uploading and subtracting 128 when downloading the tensor.
437        * Tensor zero point and weight coefficients have to be adapted to
438        * account for this.
439        */
440       assert(tensor->zero_point >= -128 && tensor->zero_point <= 127);
441       return tensor->zero_point + 128;
442    } else {
443       assert(tensor->zero_point >= 0 && tensor->zero_point <= 255);
444       return tensor->zero_point;
445    }
446 }
447 
/*
 * Translate a gallium convolution operation into an etna_operation,
 * applying the weight-tensor lowerings the NN hardware needs: pointwise
 * expansion to 2x2, depthwise expansion/reordering, stride removal and
 * the final layout transpose.
 */
void
etna_ml_lower_convolution(struct etna_ml_subgraph *subgraph,
                          const struct pipe_ml_operation *poperation,
                          struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_version = ctx->screen->specs.nn_core_version;

   /* TODO: Support stride_x != stride_y */
   assert(poperation->conv.stride_x == poperation->conv.stride_y);
   assert(poperation->type == PIPE_ML_OPERATION_TYPE_CONVOLUTION);

   operation->type = ETNA_JOB_TYPE_NN;
   operation->addition = false;
   operation->depthwise = poperation->conv.depthwise;
   operation->pointwise = poperation->conv.pointwise;
   operation->relu = poperation->conv.relu;
   operation->pooling_first_pixel = calc_pooling_first_pixel(subgraph, poperation);
   operation->padding_same = poperation->conv.padding_same;
   operation->stride = poperation->conv.stride_x;

   /* Input tensor geometry and quantization (dims are {n, w, h, c}). */
   operation->input_tensors[0] = poperation->input_tensors[0]->index;
   operation->input_count = 1;
   operation->input_width = poperation->input_tensors[0]->dims[1];
   operation->input_height = poperation->input_tensors[0]->dims[2];
   operation->input_channels = poperation->input_tensors[0]->dims[3];
   operation->input_zero_point = etna_tensor_zero_point(poperation->input_tensors[0]);
   operation->input_scale = poperation->input_tensors[0]->scale;

   operation->output_tensors[0] = poperation->output_tensors[0]->index;
   operation->output_width = poperation->output_tensors[0]->dims[1];
   operation->output_height = poperation->output_tensors[0]->dims[2];
   operation->output_channels = poperation->output_tensors[0]->dims[3];
   operation->output_zero_point = etna_tensor_zero_point(poperation->output_tensors[0]);
   operation->output_scale = poperation->output_tensors[0]->scale;

   pipe_resource_reference(&operation->weight_tensor, poperation->conv.weight_tensor->resource);
   operation->weight_width = poperation->conv.weight_tensor->dims[1];
   operation->weight_height = poperation->conv.weight_tensor->dims[2];
   operation->weight_zero_point = etna_tensor_zero_point(poperation->conv.weight_tensor);
   operation->weight_scale = poperation->conv.weight_tensor->scale;
   operation->weight_signed = poperation->conv.weight_tensor->is_signed;

   pipe_resource_reference(&operation->bias_tensor, poperation->conv.bias_tensor->resource);

   /* 1x1 kernels on a single input channel are widened to 2x2. */
   if (operation->pointwise && operation->input_channels == 1)
      pointwise_to_2x2(subgraph, operation);

   /* Depthwise lowering: pre-v8 cores expand to a regular convolution,
    * v8 only needs the weights deinterleaved. */
   if (operation->depthwise) {
      if (nn_core_version < 8 && (operation->output_channels > 1 || operation->stride > 1)) {
         if (operation->input_width < 8 && operation->input_width > 2)
            operation->pooling_first_pixel = false;
         expand_depthwise(subgraph, operation);
      } else if (operation->output_channels > 1)
         reorder_for_hw_depthwise(subgraph, operation);
   }

   if (operation->stride > 1 && !operation->pooling_first_pixel)
      strided_to_normal(subgraph, operation);  /* This will already transpose if input_channels > 1 */
   else if (operation->input_channels > 1)
      transpose(subgraph, operation);

   /* Tensor sizes computed after lowering, which may have changed the
    * input geometry. */
   operation->input_tensor_sizes[0] = operation->input_width *
                                      operation->input_height *
                                      operation->input_channels;
   ML_DBG("%dx%dx%d\n", operation->input_width, operation->input_height, operation->input_channels);

   operation->output_tensor_sizes[0] = operation->output_width *
                                       operation->output_height *
                                       operation->output_channels;
}
520 
/*
 * Scale for the synthetic weights of a lowered addition: the scale ratio
 * of the two inputs, clamped to at least 1.0, normalized into the 8-bit
 * coefficient range.
 */
static float
compute_weight_scale_add(float input1_scale, float input2_scale)
{
   double scale_ratio = input1_scale / input2_scale;

   scale_ratio = scale_ratio > 1.0 ? scale_ratio : 1.0;

   return (float)scale_ratio / 255.0;
}
528 
529 static uint8_t
compute_addition_offset(float input1_scale,float input2_scale,float weight_scale)530 compute_addition_offset(float input1_scale, float input2_scale, float weight_scale)
531 {
532   double addition_offset = input1_scale / input2_scale;
533   addition_offset /= weight_scale;
534   return round(addition_offset + 0.0) * 1;
535 }
536 
537 static uint8_t
compute_weight_add(float input1_scale,float input2_scale,float weight_scale)538 compute_weight_add(float input1_scale, float input2_scale, float weight_scale)
539 {
540    double weight = 1.0 / weight_scale;
541    return round(weight + 0.0);
542 }
543 
544 static uint32_t
compute_bias_add(float input1_scale,float input2_scale,uint8_t input1_zp,uint8_t input2_zp,float weight_scale)545 compute_bias_add(float input1_scale, float input2_scale, uint8_t input1_zp, uint8_t input2_zp, float weight_scale)
546 {
547    int zero_point_diff = input2_zp - input1_zp;
548    double bias = zero_point_diff * input1_scale;
549    bias /= weight_scale * input2_scale;
550 
551    double addition_offset = input1_scale / input2_scale;
552    addition_offset /= weight_scale;
553    addition_offset = round(addition_offset + 0.0) * 1;
554 
555    return (int) (round(bias) - round(addition_offset) * input2_zp);
556 }
557 
/*
 * Lower an elementwise addition to a convolution the NN unit can run.
 * Pre-v8 cores use a single 2x2 weight plus an addition offset; v8 cores
 * build a 1x1 convolution whose weight matrix sums the two inputs, which
 * are stacked as extra input channels.
 */
void
etna_ml_lower_add(struct etna_ml_subgraph *subgraph,
                  const struct pipe_ml_operation *poperation,
                  struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_version = ctx->screen->specs.nn_core_version;

   assert(poperation->type == PIPE_ML_OPERATION_TYPE_ADD);

   operation->type = ETNA_JOB_TYPE_NN;
   operation->addition = true;
   operation->depthwise = false;
   operation->pointwise = false;
   operation->pooling_first_pixel = false;
   operation->padding_same = false;
   operation->stride = 1;

   /* Both inputs share the first input's geometry (dims are {n, w, h, c}). */
   operation->input_width = poperation->input_tensors[0]->dims[1];
   operation->input_height = poperation->input_tensors[0]->dims[2];
   operation->input_channels = poperation->input_tensors[0]->dims[3];
   operation->input_zero_point = etna_tensor_zero_point(poperation->input_tensors[0]);
   operation->input_scale = poperation->input_tensors[0]->scale;

   operation->input_tensors[0] = poperation->input_tensors[0]->index;
   operation->input_tensor_sizes[0] = operation->input_width *
                                      operation->input_height *
                                      operation->input_channels;
   operation->input_tensors[1] = poperation->input_tensors[1]->index;
   operation->input_tensor_sizes[1] = operation->input_width *
                                      operation->input_height *
                                      operation->input_channels;
   operation->input_count = 2;

   operation->output_tensors[0] = poperation->output_tensors[0]->index;
   operation->output_width = poperation->output_tensors[0]->dims[1];
   operation->output_height = poperation->output_tensors[0]->dims[2];
   operation->output_channels = poperation->output_tensors[0]->dims[3];
   operation->output_zero_point = etna_tensor_zero_point(poperation->output_tensors[0]);
   operation->output_scale = poperation->output_tensors[0]->scale;

   operation->output_tensor_sizes[0] = operation->output_width *
                                       operation->output_height *
                                       operation->output_channels;

   if (nn_core_version < 8) {
      /* Pre-v8: a single synthetic 2x2 weight; the second input is folded
       * in via addition_offset. */
      operation->weight_tensor = etna_ml_create_resource(context, 8);
      operation->weight_width = 2;
      operation->weight_height = 2;
      operation->weight_zero_point = 0x0;
      operation->weight_scale = compute_weight_scale_add(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale);
      operation->weight_signed = false;
      operation->addition_offset = compute_addition_offset(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale, operation->weight_scale);

      /* Only the first tap carries a coefficient; the rest of the 2x2
       * weight buffer is left as allocated. */
      uint8_t *weight_map = map_resource(operation->weight_tensor);
      weight_map[0] = compute_weight_add(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale, operation->weight_scale);

      operation->bias_tensor = etna_ml_create_resource(context, 4);
      int32_t *bias_map = map_resource(operation->bias_tensor);
      bias_map[0] = compute_bias_add(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale,
                                    poperation->input_tensors[1]->zero_point, poperation->input_tensors[0]->zero_point,
                                    operation->weight_scale);
   } else {
      /* v8: the two operands are stacked as channels, so the convolution
       * sees twice the output channel count on input. */
      operation->input_channels = 2 * operation->output_channels;

      operation->weight_tensor = etna_ml_create_resource(context, operation->input_channels * operation->output_channels);
      operation->weight_width = 1;
      operation->weight_height = 1;
      operation->weight_zero_point = 0x0;
      operation->weight_scale = compute_weight_scale_add(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale);
      operation->weight_signed = false;
      operation->addition_offset = compute_addition_offset(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale, operation->weight_scale);

      uint8_t (*weight_map)[operation->input_channels] = map_resource(operation->weight_tensor);
      memset(weight_map, 0, pipe_buffer_size(operation->weight_tensor));

      uint8_t first_weight = compute_weight_add(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale, operation->weight_scale);
      uint8_t second_weight = round((poperation->input_tensors[1]->scale / poperation->input_tensors[0]->scale) / operation->weight_scale);

      /* Each output channel oc sums channel oc of the first operand
       * (weight first_weight) and channel output_channels + oc of the
       * second operand (weight second_weight). */
      for(unsigned oc = 0; oc < operation->output_channels; oc++) {
         for(unsigned ic = 0; ic < operation->input_channels; ic++) {
            if (ic == oc) {
               weight_map[oc][ic] = first_weight;
            } else if(ic == operation->output_channels + oc) {
               weight_map[oc][ic] = second_weight;
            }
         }
      }

      operation->bias_tensor = etna_ml_create_resource(context, 4 * operation->output_channels);
      uint32_t *bias_map = map_resource(operation->bias_tensor);

      /* Same bias for every output channel: compensates the zero-point
       * difference between the two operands. */
      int zero_point_diff = poperation->input_tensors[0]->zero_point - poperation->input_tensors[1]->zero_point;
      double bias = zero_point_diff * poperation->input_tensors[1]->scale;
      bias /= operation->weight_scale * poperation->input_tensors[0]->scale;
      for(unsigned oc = 0; oc < operation->output_channels; oc++)
         bias_map[oc] = (int)round(bias);
   }
}
658 
/*
 * Translate a gallium fully-connected operation into an etna_operation:
 * modeled as a convolution with a 1-pixel output per output channel.
 *
 * NOTE(review): weights and bias are read through poperation->conv even
 * though this is a fully-connected op — presumably the conv and fc
 * payloads share layout in the operation union; confirm.
 */
void
etna_ml_lower_fully_connected(struct etna_ml_subgraph *subgraph,
                              const struct pipe_ml_operation *poperation,
                              struct etna_operation *operation)
{
   assert(poperation->type == PIPE_ML_OPERATION_TYPE_FULLY_CONNECTED);

   operation->type = ETNA_JOB_TYPE_NN;
   operation->addition = false;
   operation->depthwise = false;
   operation->pointwise = false;
   operation->fully_connected = true;
   operation->pooling_first_pixel = false;
   operation->padding_same = false;
   operation->stride = 1;

   /* The input is treated as a 1D row: width x 1 x 1. */
   operation->input_tensors[0] = poperation->input_tensors[0]->index;
   operation->input_count = 1;
   operation->input_width = poperation->input_tensors[0]->dims[1];
   operation->input_height = 1;
   operation->input_channels = 1;
   /* NOTE(review): unlike the conv/add paths, zero points here are taken
    * raw instead of via etna_tensor_zero_point() — confirm that signed
    * tensors are not expected on this path. */
   operation->input_zero_point = poperation->input_tensors[0]->zero_point;
   operation->input_scale = poperation->input_tensors[0]->scale;
   operation->input_tensor_sizes[0] = operation->input_width *
                                      operation->input_height *
                                      operation->input_channels;

   operation->output_tensors[0] = poperation->output_tensors[0]->index;
   operation->output_width = 1;
   operation->output_height = 1;
   operation->output_channels = poperation->output_tensors[0]->dims[1];
   operation->output_zero_point = poperation->output_tensors[0]->zero_point;
   operation->output_scale = poperation->output_tensors[0]->scale;
   operation->output_tensor_sizes[0] = operation->output_width *
                                      operation->output_height *
                                      operation->output_channels;

   pipe_resource_reference(&operation->weight_tensor, poperation->conv.weight_tensor->resource);
   operation->weight_width = poperation->conv.weight_tensor->dims[1];
   operation->weight_height = 1;
   operation->weight_zero_point = poperation->conv.weight_tensor->zero_point;
   operation->weight_scale = poperation->conv.weight_tensor->scale;

   pipe_resource_reference(&operation->bias_tensor, poperation->conv.bias_tensor->resource);
}
704 
/*
 * Reshape an elementwise addition's tensors into the layout the lowered
 * convolution expects: pick a width that evenly divides the channel area
 * (preferring 128/64/32), fold the rest into the height, and stack the
 * two operands as two input channels producing one output channel.
 */
void
etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *input_channels,
                            unsigned *output_width, unsigned *output_height, unsigned *output_channels)
{
   ML_DBG("addition input width %d channels %d\n", *input_width, *input_channels);

   unsigned channel_size = *input_width * *input_height;
   unsigned width;

   if (channel_size % 128 == 0) {
      width = 128;
   } else if (channel_size % 64 == 0) {
      width = 64;
   } else if (channel_size % 32 == 0) {
      width = 32;
   } else {
      /* Fall back to the largest divisor below 64 (worst case 1). */
      for (width = 63; width > 1; width--)
         if (channel_size % width == 0)
            break;
   }

   *input_height = (*input_width * *input_height * *input_channels) / width;
   *input_width = width;
   *input_channels = 2;

   *output_height = *output_width * *output_height * *output_channels / width;
   *output_width = width;
   *output_channels = 1;
}
736 
737 static unsigned
etna_ml_calculate_tiling(struct etna_context * ctx,const struct etna_operation * operation,unsigned * tile_width_out,unsigned * tile_height_out)738 etna_ml_calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
739 {
740    unsigned nn_core_version = ctx->screen->specs.nn_core_version;
741    if (nn_core_version == 7)
742       return etna_ml_calculate_tiling_v7(ctx, operation, tile_width_out, tile_height_out);
743    else
744       return etna_ml_calculate_tiling_v8(ctx, operation, tile_width_out, tile_height_out);
745 }
746 
/* Build the per-operation NN hardware descriptor (struct etna_nn_params) in a
 * freshly allocated BO and return it. The descriptor encodes image/kernel
 * geometry, on-chip SRAM cache partitioning for coefficients and input tiles,
 * and the fixed-point requantization parameters. `coefficients` is the
 * already-laid-out kernel/bias BO and `coef_cache_size` the number of bytes of
 * it worth caching in on-chip SRAM.
 */
static struct etna_bo *
create_nn_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct etna_bo *coefficients, unsigned coef_cache_size)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   unsigned nn_core_version = ctx->screen->specs.nn_core_version;
   unsigned oc_sram_size = etna_ml_get_core_info(ctx)->on_chip_sram_size;
   struct etna_bo *bo = etna_ml_create_bo(context, sizeof(struct etna_nn_params));
   /* Local copies: the geometry below may be rewritten for addition,
    * fully-connected and transposed layouts without touching `operation`. */
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   unsigned output_width = operation->output_width;
   unsigned output_height = operation->output_height;
   unsigned output_channels = operation->output_channels;
   unsigned weight_width = operation->weight_width;
   unsigned weight_height = operation->weight_height;

   /* 1x1 convolutions over a single channel are programmed as 2x2 kernels —
    * presumably a hardware minimum; TODO confirm against galcore behavior. */
   if (operation->pointwise && input_channels == 1)
      weight_width = weight_height = 2;

   /* Pre-V8 cores run element-wise addition through the conv unit with the
    * operands reshaped into a 2-channel image. */
   if (nn_core_version < 8 && operation->addition) {
      etna_ml_calc_addition_sizes(&input_width, &input_height, &input_channels,
                                  &output_width, &output_height, &output_channels);
   }

   /* Keep the wider dimension as X for both input and output. */
   if (input_height > input_width) {
      SWAP(input_width, input_height);
      SWAP(output_width, output_height);
   }

   if (operation->fully_connected) {
      /* Factor the flat input vector into W x H x C with W and H each the
       * largest divisor <= 15 (both loops bottom out at 1), so the vector fits
       * the conv unit's image-size fields. */
      unsigned original_input_width = input_width;
      input_width = 15;
      while (original_input_width % input_width)
         input_width--;
      unsigned original_input_height = original_input_width / input_width;
      input_height = 15;
      while (original_input_height % input_height)
         input_height--;
      input_channels = original_input_height / input_height;
      weight_width = input_width;
      weight_height = input_height;
   }

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_nn_params *map = etna_bo_map(bo);
   /* Fixed mode bits. layer_type 0 = convolution (1 would be fully connected,
    * but FC is emulated via the factorization above). */
   map->layer_type = 0x0;
   map->no_z_offset = nn_core_version == 8;
   map->prelu = 0x0;
   map->nn_layer_flush = 0x1;
   map->brick_mode = 0x0;
   map->brick_distance = 0x0;
   map->relu = operation->relu;
   map->no_flush = nn_core_version == 8;
   map->rounding_mode = 0x1;
   map->partial_cache_data_unit = 0x0;

   if (nn_core_version == 8 && operation->depthwise)
      map->depthwise = 0x1;

   /* Zero all reserved fields — the BO contents are otherwise undefined. */
   map->unused0 = 0x0;
   map->unused1 = 0x0;
   map->unused2 = 0x0;
   map->unused3 = 0x0;
   map->unused4 = 0x0;
   map->unused5 = 0x0;
   map->unused6 = 0x0;
   map->unused7_0 = 0x0;
   map->unused7_1 = 0x0;
   map->unused7_2 = 0x0;
   map->unused7_3 = 0x0;
   map->unused8 = 0x0;
   map->unused9 = 0x0;
   map->unused10 = 0x0;
   map->unused11 = 0x0;
   map->unused12 = 0x0;
   map->unused13 = 0x0;
   map->unused14 = 0x0;
   map->further1 = 0x0;
   map->further2 = 0x0;
   /* Magic constants observed from the blob driver — meaning unknown. */
   map->further3 = 0x3ffffff;
   map->further4 = 0x7f800000;
   map->further5 = 0xff800000;
   map->further6 = 0x0;
   map->further7 = 0x0;
   map->further8 = 0x0;

   /* Input image: address, geometry and quantization. Strides equal the
    * dimensions, i.e. the image is tightly packed. */
   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensors[0]);
   unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensors[0]);
   map->in_image_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset;
   map->in_image_x_size = input_width;
   map->in_image_y_size = input_height;
   map->in_image_x_stride = input_width;
   map->in_image_y_stride = input_height;
   map->in_image_data_type = ETNA_NN_INT8;
   map->in_image_data_type_bit_2 = ETNA_NN_INT8 >> 2;
   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_addr_plus_1 = 0xFFFFFFFF >> 6;
   map->in_image_border_mode = 0x0;
   /* Out-of-bounds reads (SAME padding) return the input zero point. */
   map->in_image_border_const = operation->input_zero_point;

   if (operation->padding_same) {
      /* SAME padding is expressed as a signed 4-bit input offset (3 low bits
       * plus a separate bit 3); the values below are empirical matches to the
       * blob driver's programming — TODO confirm the exact encoding. */
      if (operation->stride == 1 && weight_width > 2) {

         if (weight_width < 5) {
            map->in_image_x_offset = 0x7;
            map->in_image_y_offset = 0x7;
         } else {
            map->in_image_x_offset = 0x6;
            map->in_image_y_offset = 0x6;
         }

         map->in_image_x_offset_bit_3 = 0x1;
         map->in_image_y_offset_bit_3 = 0x1;
         map->unused7_2 = nn_core_version == 8;
         map->unused7_3 = nn_core_version == 8;

      } else if (operation->stride == 2 && weight_width > 2 && (input_width < 5 || (operation->depthwise && (weight_width == 5 || input_width == 5)))) {

         if ((input_width <= 5 && weight_width < 5) ||
            (input_width > 5 && weight_width >= 5)) {
            map->in_image_x_offset = 0x7;
            map->in_image_y_offset = 0x7;
         } else {
            map->in_image_x_offset = 0x6;
            map->in_image_y_offset = 0x6;
         }

         map->in_image_x_offset_bit_3 = 0x1;
         map->in_image_y_offset_bit_3 = 0x1;
         map->unused7_2 = nn_core_version == 8;
         map->unused7_3 = nn_core_version == 8;
      }
   }

   /* Output image: address, geometry and quantization. */
   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensors[0]);
   offset = etna_ml_get_offset(subgraph, operation->output_tensors[0]);
   map->out_image_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;
   map->out_image_x_size = output_width;
   map->out_image_y_size = output_height;
   map->out_image_z_size = output_channels;

   map->out_image_x_stride = map->out_image_x_size;
   map->out_image_y_stride = map->out_image_y_size;

   map->out_image_data_type = ETNA_NN_INT8;
   map->out_image_data_type_bit_2 = ETNA_NN_INT8 >> 2;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_addr_plus_1 = 0xFFFFFFFF >> 6;
   map->out_zero_point = operation->output_zero_point;

   if (operation->pooling_first_pixel) {
      /* First-pixel pooling keeps the top-left value of each 2x2 window, so
       * the pre-pooling output is twice as large in X and Y. */
      map->pooling = ETNA_NN_POOLING_FIRST_PIXEL;
      map->pooling_xy_size = 0x0;

      map->out_image_x_size *= 2;
      map->out_image_y_size *= 2;
   } else {
      map->pooling = ETNA_NN_POOLING_NON;
      map->pooling_xy_size = 0x1;
   }

   unsigned tile_x, tile_y;
   unsigned superblocks = etna_ml_calculate_tiling(ctx, operation, &tile_x, &tile_y);
   map->out_image_tile_x_size = tile_x;
   map->out_image_tile_y_size = tile_y;

   /* Kernel (coefficient) stream: the address field holds the GPU VA in
    * 64-byte units. */
   map->kernel_address = etna_bo_gpu_va(coefficients) >> 6;
   map->kernel_xy_size = weight_width;
   map->kernel_y_size = weight_height;
   map->kernel_z_size = input_channels;
   map->kernel_z_size2 = 0x0;
   map->kernel_data_type = ETNA_NN_INT8;
   map->kernel_data_type_bit_2 = ETNA_NN_INT8 >> 2;
   map->kernel_direct_stream_from_VIP_sram = 0x0;

   map->coef_zero_point = operation->weight_zero_point;

   /* Output channels are split first across cores, then across superblocks. */
   map->kernels_per_core = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), superblocks);

   /* Size of the input-tile working set we would like cached in on-chip SRAM.
    * An output tile of (tx, ty) needs an input tile grown by the kernel
    * footprint minus one in each direction. */
   unsigned image_cache_size;
   if (superblocks == 1) {
      /* No point in caching the input image if there is only one iteration */
      image_cache_size = 0;
   } else {
      unsigned in_image_tile_x_size = map->out_image_tile_x_size + weight_width - 1;
      unsigned in_image_tile_y_size = map->out_image_tile_y_size + weight_width - 1;
      image_cache_size = in_image_tile_x_size * in_image_tile_y_size;
      image_cache_size = ALIGN(image_cache_size, 16);
      image_cache_size *= input_channels;
      image_cache_size = ALIGN(image_cache_size, 128);
   }

   ML_DBG("coefficients_size 0x%x (%d) image_size 0x%x (%d)\n", coef_cache_size, coef_cache_size, image_cache_size, image_cache_size);

   /* The first 0x800 bytes of SRAM are reserved (kernel cache starts after). */
   map->kernel_cache_start_address = 0x800;

   /* Get all the image tiles in the cache, then use the rest for the kernels */
   if (map->kernel_cache_start_address + coef_cache_size + image_cache_size < oc_sram_size) {
      /* Everything fits: cache the whole coefficient stream. */
      map->kernel_caching_mode = SRAM_CACHE_MODE_FULL_CACHE;
      map->kernel_pattern_msb = 0x0;
      map->kernel_pattern_low = 0x0;
      map->kernel_pattern_high = 0x0;
      map->kernel_cache_end_address = MAX2(MIN2(ALIGN(map->kernel_cache_start_address + coef_cache_size, 128), oc_sram_size), 0xa00);
   } else {
      /* Doesn't fit in the 512KB we have of on-chip SRAM */
      /* Partial caching: the pattern registers select which coefficient bursts
       * are cached; the fixed patterns below are taken from blob traces and
       * keyed off the output depth — TODO confirm their derivation. */
      map->kernel_caching_mode = SRAM_CACHE_MODE_PARTIAL_CACHE;
      if (map->out_image_z_size >= 1024) {
         map->kernel_pattern_msb = 0x13;
         map->kernel_pattern_low = 0x80000;
         map->kernel_pattern_high = 0x0;
      } else if (map->out_image_z_size >= 512) {
         map->kernel_pattern_msb = 0x3d;
         map->kernel_pattern_low = 0x0;
         map->kernel_pattern_high = 0x2aaaaaa0;
      } else if (map->out_image_z_size >= 256) {
         map->kernel_pattern_msb = 0x3e;
         map->kernel_pattern_low = 0xffffaaaa;
         map->kernel_pattern_high = 0x7fffffff;
      } else if (map->out_image_z_size >= 160) {
         map->kernel_pattern_msb = 0x6;
         map->kernel_pattern_low = 0x7e;
         map->kernel_pattern_high = 0x0;
      } else {
         map->kernel_pattern_msb = 0x3f;
         map->kernel_pattern_low = 0xfffffffe;
         map->kernel_pattern_high = 0xffffffff;
      }
      if (map->kernel_cache_start_address + coef_cache_size >= oc_sram_size) {
         map->kernel_cache_end_address = oc_sram_size;
         image_cache_size = 0;
      } else if (image_cache_size > oc_sram_size) {
         /* NOTE(review): this branch zeroes the image cache but never writes
          * kernel_cache_end_address, leaving whatever was in the freshly
          * mapped BO — verify whether it should be set to oc_sram_size here. */
         image_cache_size = 0;
      } else
         map->kernel_cache_end_address = oc_sram_size - image_cache_size;
   }

   if (image_cache_size == 0) {
      map->image_caching_mode = SRAM_CACHE_MODE_NO_CACHE;
      map->image_cache_start_address = 0x0;
      map->image_cache_end_address = 0x800;
   } else {
      map->image_caching_mode = SRAM_CACHE_MODE_FULL_CACHE;
      /* NOTE(review): this compares a size against the fixed start address
       * (0x800); presumably it decides whether the image cache needs space
       * beyond the reserved region — confirm the intent. */
      if (image_cache_size >= map->kernel_cache_start_address) {
         map->image_cache_start_address = map->kernel_cache_end_address;
         map->image_cache_end_address = MIN2(map->image_cache_start_address + image_cache_size, oc_sram_size);
         ML_DBG("image_cache_end_address %d image_cache_start_address %d image_cache_size %d oc_sram_size %d\n", map->image_cache_end_address, map->image_cache_start_address, image_cache_size, oc_sram_size);
      } else {
         map->image_cache_start_address = 0x0;
         map->image_cache_end_address = 0x800;
      }
   }

   /* Caching is not supported yet on V8 */
   if (nn_core_version == 8) {
      map->kernel_caching_mode = SRAM_CACHE_MODE_NO_CACHE;
      map->image_caching_mode = SRAM_CACHE_MODE_NO_CACHE;
   }

   /* Requantization: the combined scale is decomposed into a power-of-two
    * shift (from the float exponent) and a fixed-point multiplier (from the
    * mantissa), split across bitfields in the descriptor. */
   float conv_scale = (operation->input_scale * operation->weight_scale) / operation->output_scale;
   uint32_t scale_bits = fui(conv_scale);
   /* Taken from https://github.com/pytorch/QNNPACK/blob/master/src/qnnpack/requantization.h#L130 */
   unsigned shift = 127 + 31 - 32 - (scale_bits >> 23);
   if (nn_core_version == 8)
      shift += 1;
   else
      shift += 16;

   /* Divides by 2 * (post_shift - 18), rounding to nearest integer. If result doesn't fit in 8 bits, it is clamped to 255. galcore sets to 15 if INT8, to 0 if UINT8. */
   map->post_shift = shift & 0x1f;
   map->post_shift_bit_5_6 = (shift >> 5) & 0x3;

   /* Multiplies by (multiplier * 2^15) */
   if (nn_core_version == 8) {
      map->post_multiplier = scale_bits & 0x1;
      map->post_multiplier_1_to_6 = (scale_bits >> 1) & 0x3f;
      map->post_multiplier_7_to_14 = (scale_bits >> 7) & 0xff;
      map->post_multiplier_15_to_22 = (scale_bits >> 15) & 0xff;
   } else {
      map->post_multiplier = (scale_bits >> 8) & 0x1;
      map->post_multiplier_1_to_6 = (scale_bits >> 9) & 0x3f;
      map->post_multiplier_7_to_14 = (scale_bits >> 15) & 0xff;
   }

   map->per_channel_post_mul = 0x0;

   etna_bo_cpu_fini(bo);

   return bo;
}
1039 
1040 void
etna_ml_compile_operation_nn(struct etna_ml_subgraph * subgraph,const struct etna_operation * operation,struct etna_vip_instruction * instruction)1041 etna_ml_compile_operation_nn(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
1042                              struct etna_vip_instruction *instruction)
1043 {
1044    struct pipe_context *pctx = subgraph->base.context;
1045    struct etna_context *ctx = etna_context(pctx);
1046    unsigned nn_core_version = ctx->screen->specs.nn_core_version;
1047    unsigned coef_cache_size;
1048 
1049    instruction->type = ETNA_JOB_TYPE_NN;
1050 
1051    if (nn_core_version == 7)
1052       instruction->coefficients = etna_ml_create_coeffs_v7(subgraph, operation, &coef_cache_size);
1053    else
1054       instruction->coefficients = etna_ml_create_coeffs_v8(subgraph, operation, &coef_cache_size);
1055 
1056    struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensors[0]);
1057    assert(input);
1058    pipe_resource_reference(&instruction->input, input);
1059 
1060    struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensors[0]);
1061    assert(output);
1062    pipe_resource_reference(&instruction->output, output);
1063 
1064    instruction->configs[0] = create_nn_config(subgraph, operation, instruction->coefficients, coef_cache_size);
1065 }
1066 
1067 void
etna_ml_emit_operation_nn(struct etna_ml_subgraph * subgraph,struct etna_vip_instruction * operation,unsigned idx)1068 etna_ml_emit_operation_nn(struct etna_ml_subgraph *subgraph,
1069                           struct etna_vip_instruction *operation,
1070                           unsigned idx)
1071 {
1072    struct pipe_context *pctx = subgraph->base.context;
1073    struct etna_context *ctx = etna_context(pctx);
1074    struct etna_cmd_stream *stream = ctx->stream;
1075    unsigned offset = idx + 1;
1076    unsigned nn_config = VIVS_GL_NN_CONFIG_NN_CORE_COUNT(0x0); /* This disables power control of NN cores and enables all of them */
1077 
1078    if (!DBG_ENABLED(ETNA_DBG_NPU_PARALLEL)) {
1079       nn_config |= VIVS_GL_NN_CONFIG_SMALL_BATCH;
1080       offset = 0;
1081    }
1082 
1083    etna_set_state(stream, VIVS_GL_OCB_REMAP_START, 0x0);
1084    etna_set_state(stream, VIVS_GL_OCB_REMAP_END, 0x0);
1085 
1086    etna_set_state(stream, VIVS_GL_NN_CONFIG, nn_config);
1087    etna_set_state_reloc(stream, VIVS_PS_NN_INST_ADDR, &(struct etna_reloc) {
1088       .bo = operation->configs[0],
1089       .flags = ETNA_RELOC_READ,
1090       .offset = offset,
1091    });
1092    etna_set_state(stream, VIVS_PS_UNK10A4, offset);
1093 }
1094