/*
 * Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
 * SPDX-License-Identifier: MIT
 */

#include "pipe/p_state.h"
#include "util/u_inlines.h"

#include "etnaviv_context.h"
#include "etnaviv_debug.h"
#include "etnaviv_emit.h"
#include "etnaviv_ml.h"
#include "etnaviv_ml_nn.h"

#define ETNA_NN_INT8 0

#define SRAM_CACHE_MODE_NO_CACHE 0x0
#define SRAM_CACHE_MODE_FULL_CACHE 0x1
#define SRAM_CACHE_MODE_PARTIAL_CACHE 0x2

enum pooling_type {
   ETNA_NN_POOLING_NON,
   ETNA_NN_POOLING_MAX,
   ETNA_NN_POOLING_AVG,
   ETNA_NN_POOLING_FIRST_PIXEL
};

#define FIELD(field, bits) uint32_t field : bits;

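/*
 * Hardware descriptor for one NN layer, presumably mirroring the layout that
 * the proprietary galcore driver programs. The numbered comments below mark
 * the start of each 32-bit word in the descriptor; fields named unusedN and
 * furtherN are bits whose purpose isn't understood yet, or that are simply
 * left at fixed values further down.
 */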
struct etna_nn_params {

   FIELD(layer_type, 1) /* conv: 0 fully_connected: 1 */
   FIELD(no_z_offset, 1)
   FIELD(kernel_xy_size, 4)
   FIELD(kernel_z_size, 14) /* & 0x3FFF */
   FIELD(kernels_per_core, 7)
   FIELD(pooling, 2)
   FIELD(pooling_xy_size, 1)
   FIELD(prelu, 1)
   FIELD(nn_layer_flush, 1)

   /* 1 */
   FIELD(kernel_data_type, 2) /* UINT8 0x2 INT8 0x0 */
   FIELD(in_image_data_type, 2) /* UINT8 0x2 INT8 0x0 */
   FIELD(out_image_data_type, 2) /* UINT8 0x2 INT8 0x0 */
   FIELD(in_image_x_size, 13)
   FIELD(in_image_y_size, 13)

   /* 2 */
   FIELD(in_image_x_offset, 3)
   FIELD(in_image_y_offset, 3)
   FIELD(unused0, 1)
   FIELD(brick_mode, 1)
   FIELD(brick_distance, 16)
   FIELD(relu, 1)
   FIELD(unused1, 1)
   FIELD(post_multiplier, 1)
   FIELD(post_shift, 5)

   /* 3 */
   FIELD(unused2, 3)
   FIELD(no_flush, 1)
   FIELD(unused3, 2)
   FIELD(out_image_x_size, 13)
   FIELD(out_image_y_size, 13)

   /* 4 */
   /* Changes based on gcFEATURE_VALUE_NN_INIMAGE_OFFSET_BITS == 4 */
   FIELD(out_image_z_size, 14)
   FIELD(rounding_mode, 2)
   FIELD(in_image_x_offset_bit_3, 1) /* >> 3 & 0x1 */
   FIELD(in_image_y_offset_bit_3, 1) /* >> 3 & 0x1 */
   FIELD(out_image_tile_x_size, 7)
   FIELD(out_image_tile_y_size, 7)

   /* 5 */
   FIELD(kernel_address, 26) /* >> 6 */
   FIELD(kernel_z_size2, 6) /* >> 14 & 0x3F */

   /* 6 */
   FIELD(in_image_address, 32)

   /* 7 */
   FIELD(out_image_address, 32)

   /* 8 */
   FIELD(image_caching_mode, 2)
   FIELD(kernel_caching_mode, 2)
   FIELD(partial_cache_data_unit, 2)
   FIELD(kernel_pattern_msb, 6)
   FIELD(kernel_y_size, 4)
   FIELD(out_image_y_stride, 16)

   /* 9 */
   FIELD(kernel_pattern_low, 32)

   /* 10 */
   FIELD(kernel_pattern_high, 32)

   /* 11 */
   FIELD(kernel_cache_start_address, 32)

   /* 12 */
   FIELD(kernel_cache_end_address, 32)

   /* 13 */
   FIELD(image_cache_start_address, 32)

   /* 14 */
   FIELD(image_cache_end_address, 32)

   /* 15 */
   FIELD(in_image_border_mode, 2)
   FIELD(in_image_border_const, 16)
   FIELD(unused4, 1)
   FIELD(kernel_data_type_bit_2, 1)
   FIELD(in_image_data_type_bit_2, 1)
   FIELD(out_image_data_type_bit_2, 1)
   FIELD(post_multiplier_1_to_6, 6)
   FIELD(post_shift_bit_5_6, 2)
   FIELD(unused5, 2)

   /* 16 */
   FIELD(in_image_x_stride, 16)
   FIELD(in_image_y_stride, 16)

   /* 17 */
   FIELD(out_image_x_stride, 16)
   FIELD(unused6, 8)
   FIELD(post_multiplier_7_to_14, 8)

   /* 18 */
   FIELD(out_image_circular_buf_size, 26) /* >> 6 */
   FIELD(per_channel_post_mul, 1)
   FIELD(unused7_0, 1)
   FIELD(unused7_1, 1)
   FIELD(unused7_2, 1)
   FIELD(unused7_3, 2)

   /* 19 */
   FIELD(out_image_circular_buf_end_addr_plus_1, 26) /* >> 6 */
   FIELD(unused8, 6)

   /* 20 */
   FIELD(in_image_circular_buf_size, 26) /* >> 6 */
   FIELD(unused9, 6)

   /* 21 */
   FIELD(in_image_circular_buf_end_addr_plus_1, 26) /* >> 6 */
   FIELD(unused10, 6)

   /* 22 */
   FIELD(coef_zero_point, 8)
   FIELD(out_zero_point, 8)
   FIELD(kernel_direct_stream_from_VIP_sram, 1)
   FIELD(depthwise, 1)
   FIELD(post_multiplier_15_to_22, 8)
   FIELD(unused11, 6)

   /* 23, from here on these aren't set */
   FIELD(unused12, 32)

   /* 24 */
   FIELD(unused13, 4)
   FIELD(unused14, 28) /* 0 >> 4 */

   /* 25 */
   FIELD(unused15, 4)
   FIELD(unused16, 28) /* 0 >> 4 */

   /* 26 */
   FIELD(further1, 32)
   FIELD(further2, 32)
   FIELD(further3, 32)
   FIELD(further4, 32)
   FIELD(further5, 32)
   FIELD(further6, 32)
   FIELD(further7, 32)
   FIELD(further8, 32)
};

static void *
map_resource(struct pipe_resource *resource)
{
   return etna_bo_map(etna_resource(resource)->bo);
}


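/*
 * The hardware apparently can't run a 1x1 kernel over a single input channel
 * directly, so expand the kernel to 2x2 and fill the three extra taps with
 * the weight zero point, which contributes nothing to the accumulation.
 */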
static void
pointwise_to_2x2(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
{
   /* Fill a Nx2x2xN tensor with zero_points */
   struct pipe_context *context = subgraph->base.context;
   uint8_t *input = map_resource(operation->weight_tensor);
   unsigned new_size = operation->output_channels * 2 * 2 * operation->input_channels;
   struct pipe_resource *output_res = etna_ml_create_resource(context, new_size);
   uint8_t *output = map_resource(output_res);

   for (unsigned channel = 0; channel < operation->output_channels; channel++) {
      uint8_t *map_in = input + channel * 1 * 1 * operation->input_channels;
      uint8_t *map_out = output + channel * 2 * 2 * operation->input_channels;

      map_out[0] = map_in[0];
      if (operation->weight_signed) {
         map_out[1] = operation->weight_zero_point - 128;
         map_out[2] = operation->weight_zero_point - 128;
         map_out[3] = operation->weight_zero_point - 128;
      } else {
         map_out[1] = operation->weight_zero_point;
         map_out[2] = operation->weight_zero_point;
         map_out[3] = operation->weight_zero_point;
      }
   }

   pipe_resource_reference(&operation->weight_tensor, NULL);
   operation->weight_tensor = output_res;

   operation->weight_width = operation->weight_height = 2;
   operation->pointwise = false;
}

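/*
 * For example, a 3x3 depthwise kernel over C channels becomes C full 3x3xC
 * kernels in which output channel c keeps its own taps and every other input
 * channel's taps are set to the weight zero point, so they add nothing.
 */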
static void
expand_depthwise(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   uint8_t *input = map_resource(operation->weight_tensor);
   unsigned new_size = operation->output_channels * operation->weight_width * operation->weight_height * operation->input_channels;
   struct pipe_resource *output_res = etna_ml_create_resource(context, new_size);
   uint8_t *output = map_resource(output_res);

   /* Lower depthwise convolution to regular convolution, as the hardware doesn't support them */
   for (unsigned channel = 0; channel < operation->output_channels; channel++) {
      unsigned in_channel = channel / operation->output_channels;
      unsigned in_depth = channel % operation->output_channels;

      uint8_t *map_in = input + in_channel * operation->weight_width * operation->weight_height * operation->input_channels;
      uint8_t *map_out = output + channel * operation->weight_width * operation->weight_height * operation->input_channels;

      for (unsigned i = 0; i < operation->weight_width * operation->weight_height * operation->input_channels; i++) {
         if (i % operation->input_channels == in_depth)
            map_out[i] = map_in[i];
         else if (operation->weight_signed)
            map_out[i] = operation->weight_zero_point - 128;
         else
            map_out[i] = operation->weight_zero_point;
      }
   }

   pipe_resource_reference(&operation->weight_tensor, NULL);
   operation->weight_tensor = output_res;
}

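/*
 * Newer cores run depthwise convolutions natively, but seem to want the
 * weights laid out per channel: reorder from HxWxC element order into one
 * contiguous HxW plane per output channel.
 */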
static void
reorder_for_hw_depthwise(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   uint8_t *input = map_resource(operation->weight_tensor);
   struct pipe_resource *output_res = etna_ml_create_resource(context, pipe_buffer_size(operation->weight_tensor));
   uint8_t (*output)[operation->weight_width * operation->weight_height] = (void *)map_resource(output_res);

   for (int i = 0; i < operation->weight_height * operation->weight_width * operation->output_channels; i++) {
      unsigned out_channel = i % operation->output_channels;

      output[out_channel][i / operation->output_channels] = input[i];
   }

   pipe_resource_reference(&operation->weight_tensor, NULL);
   operation->weight_tensor = output_res;
}

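/*
 * Reorder the weights from (out, x, y, in) to (out, in, x, y), which is the
 * element order the coefficient encoding appears to expect whenever there is
 * more than one input channel.
 */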
static void
transpose(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   unsigned nn_core_version = etna_context(context)->screen->specs.nn_core_version;
   void *map = map_resource(operation->weight_tensor);
   unsigned new_size;
   struct pipe_resource *output_res;
   uint8_t *output;
   unsigned output_channels = operation->output_channels;
   unsigned input_channels;

   if (nn_core_version == 8 && operation->depthwise)
      input_channels = 1;
   else
      input_channels = operation->input_channels;

   if (operation->addition) {
      output_channels = 1;
      input_channels = 2;
   }

   new_size = operation->output_channels * operation->weight_width *
              operation->weight_height * input_channels;
   output_res = etna_ml_create_resource(context, new_size);
   output = map_resource(output_res);

   uint8_t (*input)[operation->weight_width][operation->weight_height][input_channels] = map;
   unsigned i = 0;
   for (unsigned d0 = 0; d0 < output_channels; d0++)
      for (unsigned d3 = 0; d3 < input_channels; d3++)
         for (unsigned d1 = 0; d1 < operation->weight_width; d1++)
            for (unsigned d2 = 0; d2 < operation->weight_height; d2++)
               ((uint8_t*)output)[i++] = input[d0][d1][d2][d3];

   pipe_resource_reference(&operation->weight_tensor, NULL);
   operation->weight_tensor = output_res;
}

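/*
 * Copy one input channel into a 2D output plane, taking every stride-th
 * sample starting at (offset_x, offset_y) and filling samples that fall
 * outside the input with the input zero point.
 */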
static void
subsample(uint8_t *map_in, unsigned in_width, unsigned in_height, unsigned in_depth,
          unsigned out_width, unsigned out_height, unsigned in_z,
          unsigned offset_x, unsigned offset_y, unsigned stride,
          uint8_t *map_out, int in_zp)
{
   uint8_t (*in)[in_height][in_depth] = (uint8_t(*)[in_height][in_depth])map_in;
   uint8_t (*out)[out_height] = (uint8_t(*)[out_height])map_out;

   for (unsigned x = 0; x < out_width; x++)
      for (unsigned y = 0; y < out_height; y++) {
         unsigned in_x = x * stride + offset_x;
         unsigned in_y = y * stride + offset_y;
         if (in_x < in_width && in_y < in_height)
            out[x][y] = in[in_x][in_y][in_z];
         else
            out[x][y] = in_zp;
      }
}

/* TODO: Do the reshaping in the TP units, for big enough buffers */
static void
reshape(uint8_t *input, uint8_t *output, unsigned stride, int in_zp, unsigned dims_in[4], unsigned dims_out[4])
{
   for (unsigned out_channel = 0; out_channel < dims_in[0]; out_channel++) {
      void *map_in = input + out_channel * dims_in[1] * dims_in[2] * dims_in[3];
      void *map_out = output + out_channel * dims_out[1] * dims_out[2] * dims_out[3];

      /* See Figure 3 in https://arxiv.org/abs/1712.02502 */
      /* This is only valid for stride == 2 */
      assert(stride == 2);
      uint8_t (*out)[dims_out[1]][dims_out[2]] = (uint8_t(*)[dims_out[1]][dims_out[2]])map_out;
      for (unsigned z = 0; z < dims_in[3]; z++) {
         subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 0, 0, stride, (uint8_t *)out[0 + z * stride * stride], in_zp);
         subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 0, 1, stride, (uint8_t *)out[1 + z * stride * stride], in_zp);
         subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 1, 0, stride, (uint8_t *)out[2 + z * stride * stride], in_zp);
         subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 1, 1, stride, (uint8_t *)out[3 + z * stride * stride], in_zp);
      }
   }
}

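/*
 * For example, with stride 2 a 3x3 convolution over a WxHxC input becomes a
 * 2x2 convolution over a (W/2)x(H/2)x4C input: the input is split into its
 * four stride-2 phases stacked along the channel axis, and the kernel is
 * subsampled the same way, so a stride-1 pass computes the same sums.
 */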
static void
strided_to_normal(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   uint8_t *input = map_resource(operation->weight_tensor);
   unsigned new_size;
   struct pipe_resource *output_res;
   uint8_t *output;

   /* The hardware doesn't support strides natively, so we "lower" them as
    * described in this paper:
    *
    * "Take it in your stride: Do we need striding in CNNs?" https://arxiv.org/abs/1712.02502
    */

   /* TODO: Support more strides */
   assert(operation->stride == 2);

   unsigned wdims_in[4] = {operation->output_channels,
                           operation->weight_width,
                           operation->weight_height,
                           operation->input_channels};

   operation->input_channels = operation->input_channels * operation->stride * operation->stride;
   operation->input_width = DIV_ROUND_UP(operation->input_width, operation->stride);
   operation->input_height = DIV_ROUND_UP(operation->input_height, operation->stride);

   if (operation->padding_same) {
      if (operation->weight_width == 5) {
         operation->input_width += 2;
         operation->input_height += 2;
      } else {
         operation->input_width += 1;
         operation->input_height += 1;
      }
   }

   operation->weight_width = DIV_ROUND_UP(operation->weight_width, operation->stride);
   operation->weight_height = DIV_ROUND_UP(operation->weight_height, operation->stride);

   new_size = operation->output_channels * operation->weight_width * operation->weight_height * operation->input_channels;
   output_res = etna_ml_create_resource(context, new_size);
   output = map_resource(output_res);

   unsigned wdims_out[4] = {operation->output_channels, operation->weight_width, operation->weight_height, operation->input_channels};
   int weight_zero_point = operation->weight_signed ? (operation->weight_zero_point - 128) : operation->weight_zero_point;
   reshape(input, output, operation->stride, weight_zero_point, wdims_in, wdims_out);

   pipe_resource_reference(&operation->weight_tensor, NULL);
   operation->weight_tensor = output_res;
}

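/*
 * Some strided convolutions can instead run at stride 1 with the hardware's
 * "first pixel" pooling mode, which appears to keep only the top-left sample
 * of each 2x2 output block and so halves the output in both dimensions.
 */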
static bool
calc_pooling_first_pixel(struct etna_ml_subgraph *subgraph,
                         const struct pipe_ml_operation *poperation)
{
   struct pipe_context *context = subgraph->base.context;
   unsigned nn_core_version = etna_context(context)->screen->specs.nn_core_version;
   unsigned input_width = poperation->input_tensors[0]->dims[1];
   unsigned input_channels = poperation->input_tensors[0]->dims[3];

   if (poperation->conv.stride_x == 1)
      return false;

   if (poperation->conv.depthwise)
      return true;

   if (nn_core_version < 8) {
      if (poperation->conv.pointwise)
         return true;
   } else {
      if (poperation->conv.pointwise && input_width >= 3 && input_channels > 1)
         return true;

      if (poperation->conv.pointwise && poperation->conv.padding_same)
         return true;
   }

   return false;
}

static inline uint8_t
etna_tensor_zero_point(struct pipe_tensor *tensor)
{
   if (tensor->is_signed) {
      /*
       * Since the hardware only supports unsigned 8-bit integers, signed
       * tensors are shifted from the -128..127 range to 0..255 by adding 128
       * when uploading and subtracting 128 when downloading the tensor.
       * Tensor zero point and weight coefficients have to be adapted to
       * account for this.
       */
      assert(tensor->zero_point >= -128 && tensor->zero_point <= 127);
      return tensor->zero_point + 128;
   } else {
      assert(tensor->zero_point >= 0 && tensor->zero_point <= 255);
      return tensor->zero_point;
   }
}

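/*
 * Translate a gallium convolution operation into an etna_operation, rewriting
 * the weight tensor as needed: 1x1 single-channel kernels are padded to 2x2,
 * depthwise kernels are expanded or reordered depending on the core version,
 * strided convolutions are lowered to stride 1, and multi-channel weights are
 * transposed into the order the coefficient encoder expects.
 */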
void
etna_ml_lower_convolution(struct etna_ml_subgraph *subgraph,
                          const struct pipe_ml_operation *poperation,
                          struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_version = ctx->screen->specs.nn_core_version;

   /* TODO: Support stride_x != stride_y */
   assert(poperation->conv.stride_x == poperation->conv.stride_y);
   assert(poperation->type == PIPE_ML_OPERATION_TYPE_CONVOLUTION);

   operation->type = ETNA_JOB_TYPE_NN;
   operation->addition = false;
   operation->depthwise = poperation->conv.depthwise;
   operation->pointwise = poperation->conv.pointwise;
   operation->relu = poperation->conv.relu;
   operation->pooling_first_pixel = calc_pooling_first_pixel(subgraph, poperation);
   operation->padding_same = poperation->conv.padding_same;
   operation->stride = poperation->conv.stride_x;

   operation->input_tensors[0] = poperation->input_tensors[0]->index;
   operation->input_count = 1;
   operation->input_width = poperation->input_tensors[0]->dims[1];
   operation->input_height = poperation->input_tensors[0]->dims[2];
   operation->input_channels = poperation->input_tensors[0]->dims[3];
   operation->input_zero_point = etna_tensor_zero_point(poperation->input_tensors[0]);
   operation->input_scale = poperation->input_tensors[0]->scale;

   operation->output_tensors[0] = poperation->output_tensors[0]->index;
   operation->output_width = poperation->output_tensors[0]->dims[1];
   operation->output_height = poperation->output_tensors[0]->dims[2];
   operation->output_channels = poperation->output_tensors[0]->dims[3];
   operation->output_zero_point = etna_tensor_zero_point(poperation->output_tensors[0]);
   operation->output_scale = poperation->output_tensors[0]->scale;

   pipe_resource_reference(&operation->weight_tensor, poperation->conv.weight_tensor->resource);
   operation->weight_width = poperation->conv.weight_tensor->dims[1];
   operation->weight_height = poperation->conv.weight_tensor->dims[2];
   operation->weight_zero_point = etna_tensor_zero_point(poperation->conv.weight_tensor);
   operation->weight_scale = poperation->conv.weight_tensor->scale;
   operation->weight_signed = poperation->conv.weight_tensor->is_signed;

   pipe_resource_reference(&operation->bias_tensor, poperation->conv.bias_tensor->resource);

   if (operation->pointwise && operation->input_channels == 1)
      pointwise_to_2x2(subgraph, operation);

   if (operation->depthwise) {
      if (nn_core_version < 8 && (operation->output_channels > 1 || operation->stride > 1)) {
         if (operation->input_width < 8 && operation->input_width > 2)
            operation->pooling_first_pixel = false;
         expand_depthwise(subgraph, operation);
      } else if (operation->output_channels > 1)
         reorder_for_hw_depthwise(subgraph, operation);
   }

   if (operation->stride > 1 && !operation->pooling_first_pixel)
      strided_to_normal(subgraph, operation); /* This will already transpose if input_channels > 1 */
   else if (operation->input_channels > 1)
      transpose(subgraph, operation);

   operation->input_tensor_sizes[0] = operation->input_width *
                                      operation->input_height *
                                      operation->input_channels;
   ML_DBG("%dx%dx%d\n", operation->input_width, operation->input_height, operation->input_channels);

   operation->output_tensor_sizes[0] = operation->output_width *
                                       operation->output_height *
                                       operation->output_channels;
}

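/*
 * Elementwise addition is lowered to a small convolution whose weights copy
 * (and rescale) the two inputs. The helpers below derive the weight scale,
 * the integer weight and the bias that make the quantized arithmetic come out
 * as input1 + input2.
 */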
static float
compute_weight_scale_add(float input1_scale, float input2_scale)
{
   double scale_ratio = input1_scale / input2_scale;

   return (float) MAX2(scale_ratio, 1.0) / 255.0;
}

static uint8_t
compute_addition_offset(float input1_scale, float input2_scale, float weight_scale)
{
   double addition_offset = input1_scale / input2_scale;
   addition_offset /= weight_scale;
   return round(addition_offset + 0.0) * 1;
}

static uint8_t
compute_weight_add(float input1_scale, float input2_scale, float weight_scale)
{
   double weight = 1.0 / weight_scale;
   return round(weight + 0.0);
}

static uint32_t
compute_bias_add(float input1_scale, float input2_scale, uint8_t input1_zp, uint8_t input2_zp, float weight_scale)
{
   int zero_point_diff = input2_zp - input1_zp;
   double bias = zero_point_diff * input1_scale;
   bias /= weight_scale * input2_scale;

   double addition_offset = input1_scale / input2_scale;
   addition_offset /= weight_scale;
   addition_offset = round(addition_offset + 0.0) * 1;

   return (int) (round(bias) - round(addition_offset) * input2_zp);
}

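/*
 * Lower an elementwise add to a convolution job. On pre-v8 cores the two
 * input tensors are fed as an interleaved two-channel image through a 2x2
 * kernel; on v8 they are stacked along the channel axis and added with a 1x1
 * kernel whose weight matrix selects one channel from each input per output
 * channel.
 */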
void
etna_ml_lower_add(struct etna_ml_subgraph *subgraph,
                  const struct pipe_ml_operation *poperation,
                  struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_version = ctx->screen->specs.nn_core_version;

   assert(poperation->type == PIPE_ML_OPERATION_TYPE_ADD);

   operation->type = ETNA_JOB_TYPE_NN;
   operation->addition = true;
   operation->depthwise = false;
   operation->pointwise = false;
   operation->pooling_first_pixel = false;
   operation->padding_same = false;
   operation->stride = 1;

   operation->input_width = poperation->input_tensors[0]->dims[1];
   operation->input_height = poperation->input_tensors[0]->dims[2];
   operation->input_channels = poperation->input_tensors[0]->dims[3];
   operation->input_zero_point = etna_tensor_zero_point(poperation->input_tensors[0]);
   operation->input_scale = poperation->input_tensors[0]->scale;

   operation->input_tensors[0] = poperation->input_tensors[0]->index;
   operation->input_tensor_sizes[0] = operation->input_width *
                                      operation->input_height *
                                      operation->input_channels;
   operation->input_tensors[1] = poperation->input_tensors[1]->index;
   operation->input_tensor_sizes[1] = operation->input_width *
                                      operation->input_height *
                                      operation->input_channels;
   operation->input_count = 2;

   operation->output_tensors[0] = poperation->output_tensors[0]->index;
   operation->output_width = poperation->output_tensors[0]->dims[1];
   operation->output_height = poperation->output_tensors[0]->dims[2];
   operation->output_channels = poperation->output_tensors[0]->dims[3];
   operation->output_zero_point = etna_tensor_zero_point(poperation->output_tensors[0]);
   operation->output_scale = poperation->output_tensors[0]->scale;

   operation->output_tensor_sizes[0] = operation->output_width *
                                       operation->output_height *
                                       operation->output_channels;

   if (nn_core_version < 8) {
      operation->weight_tensor = etna_ml_create_resource(context, 8);
      operation->weight_width = 2;
      operation->weight_height = 2;
      operation->weight_zero_point = 0x0;
      operation->weight_scale = compute_weight_scale_add(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale);
      operation->weight_signed = false;
      operation->addition_offset = compute_addition_offset(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale, operation->weight_scale);

      uint8_t *weight_map = map_resource(operation->weight_tensor);
      weight_map[0] = compute_weight_add(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale, operation->weight_scale);

      operation->bias_tensor = etna_ml_create_resource(context, 4);
      int32_t *bias_map = map_resource(operation->bias_tensor);
      bias_map[0] = compute_bias_add(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale,
                                     poperation->input_tensors[1]->zero_point, poperation->input_tensors[0]->zero_point,
                                     operation->weight_scale);
   } else {
      operation->input_channels = 2 * operation->output_channels;

      operation->weight_tensor = etna_ml_create_resource(context, operation->input_channels * operation->output_channels);
      operation->weight_width = 1;
      operation->weight_height = 1;
      operation->weight_zero_point = 0x0;
      operation->weight_scale = compute_weight_scale_add(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale);
      operation->weight_signed = false;
      operation->addition_offset = compute_addition_offset(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale, operation->weight_scale);

      uint8_t (*weight_map)[operation->input_channels] = map_resource(operation->weight_tensor);
      memset(weight_map, 0, pipe_buffer_size(operation->weight_tensor));

      uint8_t first_weight = compute_weight_add(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale, operation->weight_scale);
      uint8_t second_weight = round((poperation->input_tensors[1]->scale / poperation->input_tensors[0]->scale) / operation->weight_scale);

      for (unsigned oc = 0; oc < operation->output_channels; oc++) {
         for (unsigned ic = 0; ic < operation->input_channels; ic++) {
            if (ic == oc) {
               weight_map[oc][ic] = first_weight;
            } else if (ic == operation->output_channels + oc) {
               weight_map[oc][ic] = second_weight;
            }
         }
      }

      operation->bias_tensor = etna_ml_create_resource(context, 4 * operation->output_channels);
      uint32_t *bias_map = map_resource(operation->bias_tensor);

      int zero_point_diff = poperation->input_tensors[0]->zero_point - poperation->input_tensors[1]->zero_point;
      double bias = zero_point_diff * poperation->input_tensors[1]->scale;
      bias /= operation->weight_scale * poperation->input_tensors[0]->scale;
      for (unsigned oc = 0; oc < operation->output_channels; oc++)
         bias_map[oc] = (int)round(bias);
   }
}

void
etna_ml_lower_fully_connected(struct etna_ml_subgraph *subgraph,
                              const struct pipe_ml_operation *poperation,
                              struct etna_operation *operation)
{
   assert(poperation->type == PIPE_ML_OPERATION_TYPE_FULLY_CONNECTED);

   operation->type = ETNA_JOB_TYPE_NN;
   operation->addition = false;
   operation->depthwise = false;
   operation->pointwise = false;
   operation->fully_connected = true;
   operation->pooling_first_pixel = false;
   operation->padding_same = false;
   operation->stride = 1;

   operation->input_tensors[0] = poperation->input_tensors[0]->index;
   operation->input_count = 1;
   operation->input_width = poperation->input_tensors[0]->dims[1];
   operation->input_height = 1;
   operation->input_channels = 1;
   operation->input_zero_point = poperation->input_tensors[0]->zero_point;
   operation->input_scale = poperation->input_tensors[0]->scale;
   operation->input_tensor_sizes[0] = operation->input_width *
                                      operation->input_height *
                                      operation->input_channels;

   operation->output_tensors[0] = poperation->output_tensors[0]->index;
   operation->output_width = 1;
   operation->output_height = 1;
   operation->output_channels = poperation->output_tensors[0]->dims[1];
   operation->output_zero_point = poperation->output_tensors[0]->zero_point;
   operation->output_scale = poperation->output_tensors[0]->scale;
   operation->output_tensor_sizes[0] = operation->output_width *
                                       operation->output_height *
                                       operation->output_channels;

   pipe_resource_reference(&operation->weight_tensor, poperation->conv.weight_tensor->resource);
   operation->weight_width = poperation->conv.weight_tensor->dims[1];
   operation->weight_height = 1;
   operation->weight_zero_point = poperation->conv.weight_tensor->zero_point;
   operation->weight_scale = poperation->conv.weight_tensor->scale;

   pipe_resource_reference(&operation->bias_tensor, poperation->conv.bias_tensor->resource);
}

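/*
 * For the pre-v8 addition path, reshape both addends into a two-channel image:
 * the width is picked as 128, 64 or 32 when the channel plane size is
 * divisible by one of those, otherwise the largest divisor below 64, and the
 * height takes up the rest of the elements.
 */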
void
etna_ml_calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *input_channels,
                            unsigned *output_width, unsigned *output_height, unsigned *output_channels)
{
   ML_DBG("addition input width %d channels %d\n", *input_width, *input_channels);

   unsigned channel_size = *input_width * *input_height;
   unsigned width = 0;
   if (channel_size % 128 == 0)
      width = 128;
   else if (channel_size % 64 == 0)
      width = 64;
   else if (channel_size % 32 == 0)
      width = 32;
   else {
      for (int i = 63; i > 0; i--) {
         if (channel_size % i == 0) {
            width = i;
            break;
         }
      }
   }

   *input_height = (*input_width * *input_height * *input_channels) / width;
   *input_width = width;
   *input_channels = 2;

   *output_height = *output_width * *output_height * *output_channels / width;
   *output_width = width;
   *output_channels = 1;
}

static unsigned
etna_ml_calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation,
                         unsigned *tile_width_out, unsigned *tile_height_out)
{
   unsigned nn_core_version = ctx->screen->specs.nn_core_version;
   if (nn_core_version == 7)
      return etna_ml_calculate_tiling_v7(ctx, operation, tile_width_out, tile_height_out);
   else
      return etna_ml_calculate_tiling_v8(ctx, operation, tile_width_out, tile_height_out);
}

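/*
 * Build the etna_nn_params descriptor for one NN job: image addresses and
 * sizes, kernel address, padding offsets, tiling, the on-chip SRAM cache
 * windows and the requantization constants.
 */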
static struct etna_bo *
create_nn_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
                 struct etna_bo *coefficients, unsigned coef_cache_size)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   unsigned nn_core_version = ctx->screen->specs.nn_core_version;
   unsigned oc_sram_size = etna_ml_get_core_info(ctx)->on_chip_sram_size;
   struct etna_bo *bo = etna_ml_create_bo(context, sizeof(struct etna_nn_params));
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   unsigned output_width = operation->output_width;
   unsigned output_height = operation->output_height;
   unsigned output_channels = operation->output_channels;
   unsigned weight_width = operation->weight_width;
   unsigned weight_height = operation->weight_height;

   if (operation->pointwise && input_channels == 1)
      weight_width = weight_height = 2;

   if (nn_core_version < 8 && operation->addition) {
      etna_ml_calc_addition_sizes(&input_width, &input_height, &input_channels,
                                  &output_width, &output_height, &output_channels);
   }

   if (input_height > input_width) {
      SWAP(input_width, input_height);
      SWAP(output_width, output_height);
   }

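   /*
    * The fully connected input is a flat vector, so factor its length into a
    * width and height of at most 15 each (the largest divisors that fit) and
    * put whatever remains into the channel dimension; the kernel then covers
    * the whole image.
    */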
   if (operation->fully_connected) {
      unsigned original_input_width = input_width;
      input_width = 15;
      while (original_input_width % input_width)
         input_width--;
      unsigned original_input_height = original_input_width / input_width;
      input_height = 15;
      while (original_input_height % input_height)
         input_height--;
      input_channels = original_input_height / input_height;
      weight_width = input_width;
      weight_height = input_height;
   }

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_nn_params *map = etna_bo_map(bo);
   map->layer_type = 0x0;
   map->no_z_offset = nn_core_version == 8;
   map->prelu = 0x0;
   map->nn_layer_flush = 0x1;
   map->brick_mode = 0x0;
   map->brick_distance = 0x0;
   map->relu = operation->relu;
   map->no_flush = nn_core_version == 8;
   map->rounding_mode = 0x1;
   map->partial_cache_data_unit = 0x0;

   if (nn_core_version == 8 && operation->depthwise)
      map->depthwise = 0x1;

   map->unused0 = 0x0;
   map->unused1 = 0x0;
   map->unused2 = 0x0;
   map->unused3 = 0x0;
   map->unused4 = 0x0;
   map->unused5 = 0x0;
   map->unused6 = 0x0;
   map->unused7_0 = 0x0;
   map->unused7_1 = 0x0;
   map->unused7_2 = 0x0;
   map->unused7_3 = 0x0;
   map->unused8 = 0x0;
   map->unused9 = 0x0;
   map->unused10 = 0x0;
   map->unused11 = 0x0;
   map->unused12 = 0x0;
   map->unused13 = 0x0;
   map->unused14 = 0x0;
   map->further1 = 0x0;
   map->further2 = 0x0;
   map->further3 = 0x3ffffff;
   map->further4 = 0x7f800000;
   map->further5 = 0xff800000;
   map->further6 = 0x0;
   map->further7 = 0x0;
   map->further8 = 0x0;

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensors[0]);
   unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensors[0]);
   map->in_image_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset;
   map->in_image_x_size = input_width;
   map->in_image_y_size = input_height;
   map->in_image_x_stride = input_width;
   map->in_image_y_stride = input_height;
   map->in_image_data_type = ETNA_NN_INT8;
   map->in_image_data_type_bit_2 = ETNA_NN_INT8 >> 2;
   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_addr_plus_1 = 0xFFFFFFFF >> 6;
   map->in_image_border_mode = 0x0;
   map->in_image_border_const = operation->input_zero_point;

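   /*
    * SAME padding is expressed as a negative start offset into the input
    * image. The offset appears to be a 4-bit two's-complement value split
    * across the in_image_x/y_offset fields and their _bit_3 counterparts:
    * 0xf (-1) pads by one pixel, 0xe (-2) pads by two.
    */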
   if (operation->padding_same) {
      if (operation->stride == 1 && weight_width > 2) {

         if (weight_width < 5) {
            map->in_image_x_offset = 0x7;
            map->in_image_y_offset = 0x7;
         } else {
            map->in_image_x_offset = 0x6;
            map->in_image_y_offset = 0x6;
         }

         map->in_image_x_offset_bit_3 = 0x1;
         map->in_image_y_offset_bit_3 = 0x1;
         map->unused7_2 = nn_core_version == 8;
         map->unused7_3 = nn_core_version == 8;

      } else if (operation->stride == 2 && weight_width > 2 &&
                 (input_width < 5 || (operation->depthwise && (weight_width == 5 || input_width == 5)))) {

         if ((input_width <= 5 && weight_width < 5) ||
             (input_width > 5 && weight_width >= 5)) {
            map->in_image_x_offset = 0x7;
            map->in_image_y_offset = 0x7;
         } else {
            map->in_image_x_offset = 0x6;
            map->in_image_y_offset = 0x6;
         }

         map->in_image_x_offset_bit_3 = 0x1;
         map->in_image_y_offset_bit_3 = 0x1;
         map->unused7_2 = nn_core_version == 8;
         map->unused7_3 = nn_core_version == 8;
      }
   }

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensors[0]);
   offset = etna_ml_get_offset(subgraph, operation->output_tensors[0]);
   map->out_image_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;
   map->out_image_x_size = output_width;
   map->out_image_y_size = output_height;
   map->out_image_z_size = output_channels;

   map->out_image_x_stride = map->out_image_x_size;
   map->out_image_y_stride = map->out_image_y_size;

   map->out_image_data_type = ETNA_NN_INT8;
   map->out_image_data_type_bit_2 = ETNA_NN_INT8 >> 2;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_addr_plus_1 = 0xFFFFFFFF >> 6;
   map->out_zero_point = operation->output_zero_point;

   if (operation->pooling_first_pixel) {
      map->pooling = ETNA_NN_POOLING_FIRST_PIXEL;
      map->pooling_xy_size = 0x0;

      map->out_image_x_size *= 2;
      map->out_image_y_size *= 2;
   } else {
      map->pooling = ETNA_NN_POOLING_NON;
      map->pooling_xy_size = 0x1;
   }

   unsigned tile_x, tile_y;
   unsigned superblocks = etna_ml_calculate_tiling(ctx, operation, &tile_x, &tile_y);
   map->out_image_tile_x_size = tile_x;
   map->out_image_tile_y_size = tile_y;

   map->kernel_address = etna_bo_gpu_va(coefficients) >> 6;
   map->kernel_xy_size = weight_width;
   map->kernel_y_size = weight_height;
   map->kernel_z_size = input_channels;
   map->kernel_z_size2 = 0x0;
   map->kernel_data_type = ETNA_NN_INT8;
   map->kernel_data_type_bit_2 = ETNA_NN_INT8 >> 2;
   map->kernel_direct_stream_from_VIP_sram = 0x0;

   map->coef_zero_point = operation->weight_zero_point;

   map->kernels_per_core = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), superblocks);

   unsigned image_cache_size;
   if (superblocks == 1) {
      /* No point in caching the input image if there is only one iteration */
      image_cache_size = 0;
   } else {
      unsigned in_image_tile_x_size = map->out_image_tile_x_size + weight_width - 1;
      unsigned in_image_tile_y_size = map->out_image_tile_y_size + weight_width - 1;
      image_cache_size = in_image_tile_x_size * in_image_tile_y_size;
      image_cache_size = ALIGN(image_cache_size, 16);
      image_cache_size *= input_channels;
      image_cache_size = ALIGN(image_cache_size, 128);
   }

   ML_DBG("coefficients_size 0x%x (%d) image_size 0x%x (%d)\n", coef_cache_size, coef_cache_size, image_cache_size, image_cache_size);

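   /*
    * Lay out the on-chip SRAM: kernels are cached starting at 0x800 (the
    * first 2 KB appear to be reserved), with the image-tile cache placed
    * after them. When not everything fits, the image cache keeps its space
    * and the kernels fall back to partial caching with a pattern chosen by
    * output depth.
    */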
   map->kernel_cache_start_address = 0x800;

   /* Get all the image tiles in the cache, then use the rest for the kernels */
   if (map->kernel_cache_start_address + coef_cache_size + image_cache_size < oc_sram_size) {
      map->kernel_caching_mode = SRAM_CACHE_MODE_FULL_CACHE;
      map->kernel_pattern_msb = 0x0;
      map->kernel_pattern_low = 0x0;
      map->kernel_pattern_high = 0x0;
      map->kernel_cache_end_address = MAX2(MIN2(ALIGN(map->kernel_cache_start_address + coef_cache_size, 128), oc_sram_size), 0xa00);
   } else {
      /* Doesn't fit in the 512KB we have of on-chip SRAM */
      map->kernel_caching_mode = SRAM_CACHE_MODE_PARTIAL_CACHE;
      if (map->out_image_z_size >= 1024) {
         map->kernel_pattern_msb = 0x13;
         map->kernel_pattern_low = 0x80000;
         map->kernel_pattern_high = 0x0;
      } else if (map->out_image_z_size >= 512) {
         map->kernel_pattern_msb = 0x3d;
         map->kernel_pattern_low = 0x0;
         map->kernel_pattern_high = 0x2aaaaaa0;
      } else if (map->out_image_z_size >= 256) {
         map->kernel_pattern_msb = 0x3e;
         map->kernel_pattern_low = 0xffffaaaa;
         map->kernel_pattern_high = 0x7fffffff;
      } else if (map->out_image_z_size >= 160) {
         map->kernel_pattern_msb = 0x6;
         map->kernel_pattern_low = 0x7e;
         map->kernel_pattern_high = 0x0;
      } else {
         map->kernel_pattern_msb = 0x3f;
         map->kernel_pattern_low = 0xfffffffe;
         map->kernel_pattern_high = 0xffffffff;
      }
      if (map->kernel_cache_start_address + coef_cache_size >= oc_sram_size) {
         map->kernel_cache_end_address = oc_sram_size;
         image_cache_size = 0;
      } else if (image_cache_size > oc_sram_size) {
         image_cache_size = 0;
      } else
         map->kernel_cache_end_address = oc_sram_size - image_cache_size;
   }

   if (image_cache_size == 0) {
      map->image_caching_mode = SRAM_CACHE_MODE_NO_CACHE;
      map->image_cache_start_address = 0x0;
      map->image_cache_end_address = 0x800;
   } else {
      map->image_caching_mode = SRAM_CACHE_MODE_FULL_CACHE;
      if (image_cache_size >= map->kernel_cache_start_address) {
         map->image_cache_start_address = map->kernel_cache_end_address;
         map->image_cache_end_address = MIN2(map->image_cache_start_address + image_cache_size, oc_sram_size);
         ML_DBG("image_cache_end_address %d image_cache_start_address %d image_cache_size %d oc_sram_size %d\n", map->image_cache_end_address, map->image_cache_start_address, image_cache_size, oc_sram_size);
      } else {
         map->image_cache_start_address = 0x0;
         map->image_cache_end_address = 0x800;
      }
   }

   /* Caching is not supported yet on V8 */
   if (nn_core_version == 8) {
      map->kernel_caching_mode = SRAM_CACHE_MODE_NO_CACHE;
      map->image_caching_mode = SRAM_CACHE_MODE_NO_CACHE;
   }

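   /*
    * Requantization: the accumulated int32 result is scaled by
    * input_scale * weight_scale / output_scale, expressed in hardware as a
    * fixed-point multiplier plus a right shift. The shift is recovered from
    * the exponent of the float scale and the multiplier from its mantissa
    * bits, in the spirit of the QNNPACK code referenced below.
    */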
   float conv_scale = (operation->input_scale * operation->weight_scale) / operation->output_scale;
   uint32_t scale_bits = fui(conv_scale);
   /* Taken from https://github.com/pytorch/QNNPACK/blob/master/src/qnnpack/requantization.h#L130 */
   unsigned shift = 127 + 31 - 32 - (scale_bits >> 23);
   if (nn_core_version == 8)
      shift += 1;
   else
      shift += 16;

   /* Divides by 2 * (post_shift - 18), rounding to nearest integer. If result doesn't fit in 8 bits, it is clamped to 255. galcore sets to 15 if INT8, to 0 if UINT8. */
   map->post_shift = shift & 0x1f;
   map->post_shift_bit_5_6 = (shift >> 5) & 0x3;

   /* Multiplies by (multiplier * 2^15) */
   if (nn_core_version == 8) {
      map->post_multiplier = scale_bits & 0x1;
      map->post_multiplier_1_to_6 = (scale_bits >> 1) & 0x3f;
      map->post_multiplier_7_to_14 = (scale_bits >> 7) & 0xff;
      map->post_multiplier_15_to_22 = (scale_bits >> 15) & 0xff;
   } else {
      map->post_multiplier = (scale_bits >> 8) & 0x1;
      map->post_multiplier_1_to_6 = (scale_bits >> 9) & 0x3f;
      map->post_multiplier_7_to_14 = (scale_bits >> 15) & 0xff;
   }

   map->per_channel_post_mul = 0x0;

   etna_bo_cpu_fini(bo);

   return bo;
}

void
etna_ml_compile_operation_nn(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
                             struct etna_vip_instruction *instruction)
{
   struct pipe_context *pctx = subgraph->base.context;
   struct etna_context *ctx = etna_context(pctx);
   unsigned nn_core_version = ctx->screen->specs.nn_core_version;
   unsigned coef_cache_size;

   instruction->type = ETNA_JOB_TYPE_NN;

   if (nn_core_version == 7)
      instruction->coefficients = etna_ml_create_coeffs_v7(subgraph, operation, &coef_cache_size);
   else
      instruction->coefficients = etna_ml_create_coeffs_v8(subgraph, operation, &coef_cache_size);

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensors[0]);
   assert(input);
   pipe_resource_reference(&instruction->input, input);

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensors[0]);
   assert(output);
   pipe_resource_reference(&instruction->output, output);

   instruction->configs[0] = create_nn_config(subgraph, operation, instruction->coefficients, coef_cache_size);
}

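/*
 * Emit the command-stream state for one NN job: point VIVS_PS_NN_INST_ADDR at
 * the descriptor built by create_nn_config and enable all NN cores; unless
 * parallel NPU execution is enabled for debugging, jobs run in small-batch
 * mode with a zero offset.
 */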
void
etna_ml_emit_operation_nn(struct etna_ml_subgraph *subgraph,
                          struct etna_vip_instruction *operation,
                          unsigned idx)
{
   struct pipe_context *pctx = subgraph->base.context;
   struct etna_context *ctx = etna_context(pctx);
   struct etna_cmd_stream *stream = ctx->stream;
   unsigned offset = idx + 1;
   unsigned nn_config = VIVS_GL_NN_CONFIG_NN_CORE_COUNT(0x0); /* This disables power control of NN cores and enables all of them */

   if (!DBG_ENABLED(ETNA_DBG_NPU_PARALLEL)) {
      nn_config |= VIVS_GL_NN_CONFIG_SMALL_BATCH;
      offset = 0;
   }

   etna_set_state(stream, VIVS_GL_OCB_REMAP_START, 0x0);
   etna_set_state(stream, VIVS_GL_OCB_REMAP_END, 0x0);

   etna_set_state(stream, VIVS_GL_NN_CONFIG, nn_config);
   etna_set_state_reloc(stream, VIVS_PS_NN_INST_ADDR, &(struct etna_reloc) {
      .bo = operation->configs[0],
      .flags = ETNA_RELOC_READ,
      .offset = offset,
   });
   etna_set_state(stream, VIVS_PS_UNK10A4, offset);
}
1094