/*
 * Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
 * SPDX-License-Identifier: MIT
 */

#include "util/u_inlines.h"

#include "etnaviv_context.h"
#include "etnaviv_debug.h"
#include "etnaviv_emit.h"
#include "etnaviv_ml_nn.h"

#define ETNA_NN_INT8 0

#define SRAM_CACHE_MODE_NO_CACHE 0x0
#define SRAM_CACHE_MODE_FULL_CACHE 0x1
#define SRAM_CACHE_MODE_PARTIAL_CACHE 0x2

enum pooling_type {
   ETNA_NN_POOLING_NON,
   ETNA_NN_POOLING_MAX,
   ETNA_NN_POOLING_AVG,
   ETNA_NN_POOLING_FIRST_PIXEL
};

#define FIELD(field, bits) uint32_t field : bits;

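/*
 * Hardware layer descriptor for the NN unit, referenced from
 * VIVS_PS_NN_INST_ADDR in etna_ml_emit_operation_nn(). The numbered comments
 * below mark 32-bit word boundaries in the descriptor (each group of
 * bitfields between two numbers adds up to 32 bits). Fields such as
 * kernel_z_size2 or post_shift_bit_5_6 hold the upper bits of values that
 * don't fit in their primary field.
 */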
struct etna_nn_params {

   FIELD(layer_type, 1) /* conv: 0 fully_connected: 1 */
   FIELD(no_z_offset, 1)
   FIELD(kernel_xy_size, 4)
   FIELD(kernel_z_size, 14) /* & 0x3FFF */
   FIELD(kernels_per_core, 7)
   FIELD(pooling, 2)
   FIELD(pooling_xy_size, 1)
   FIELD(prelu, 1)
   FIELD(nn_layer_flush, 1)

   /* 1 */
   FIELD(kernel_data_type, 2) /* UINT8 0x2 INT8 0x0 */
   FIELD(in_image_data_type, 2) /* UINT8 0x2 INT8 0x0 */
   FIELD(out_image_data_type, 2) /* UINT8 0x2 INT8 0x0 */
   FIELD(in_image_x_size, 13)
   FIELD(in_image_y_size, 13)

   /* 2 */
   FIELD(in_image_x_offset, 3)
   FIELD(in_image_y_offset, 3)
   FIELD(unused0, 1)
   FIELD(brick_mode, 1)
   FIELD(brick_distance, 16)
   FIELD(relu, 1)
   FIELD(unused1, 1)
   FIELD(post_multiplier, 1)
   FIELD(post_shift, 5)

   /* 3 */
   FIELD(unused2, 3)
   FIELD(no_flush, 1)
   FIELD(unused3, 2)
   FIELD(out_image_x_size, 13)
   FIELD(out_image_y_size, 13)

   /* 4 */
   /* Changes based on gcFEATURE_VALUE_NN_INIMAGE_OFFSET_BITS == 4 */
   FIELD(out_image_z_size, 14)
   FIELD(rounding_mode, 2)
   FIELD(in_image_x_offset_bit_3, 1) /* >> 3 & 0x1 */
   FIELD(in_image_y_offset_bit_3, 1) /* >> 3 & 0x1 */
   FIELD(out_image_tile_x_size, 7)
   FIELD(out_image_tile_y_size, 7)

   /* 5 */
   FIELD(kernel_address, 26) /* >> 6 */
   FIELD(kernel_z_size2, 6) /* >> 14 & 0x3F */

   /* 6 */
   FIELD(in_image_address, 32)

   /* 7 */
   FIELD(out_image_address, 32)

   /* 8 */
   FIELD(image_caching_mode, 2)
   FIELD(kernel_caching_mode, 2)
   FIELD(partial_cache_data_unit, 2)
   FIELD(kernel_pattern_msb, 6)
   FIELD(kernel_y_size, 4)
   FIELD(out_image_y_stride, 16)

   /* 9 */
   FIELD(kernel_pattern_low, 32)

   /* 10 */
   FIELD(kernel_pattern_high, 32)

   /* 11 */
   FIELD(kernel_cache_start_address, 32)

   /* 12 */
   FIELD(kernel_cache_end_address, 32)

   /* 13 */
   FIELD(image_cache_start_address, 32)

   /* 14 */
   FIELD(image_cache_end_address, 32)

   /* 15 */
   FIELD(in_image_border_mode, 2)
   FIELD(in_image_border_const, 16)
   FIELD(unused4, 1)
   FIELD(kernel_data_type_bit_2, 1)
   FIELD(in_image_data_type_bit_2, 1)
   FIELD(out_image_data_type_bit_2, 1)
   FIELD(post_multiplier_1_to_6, 6)
   FIELD(post_shift_bit_5_6, 2)
   FIELD(unused5, 2)

   /* 16 */
   FIELD(in_image_x_stride, 16)
   FIELD(in_image_y_stride, 16)

   /* 17 */
   FIELD(out_image_x_stride, 16)
   FIELD(unused6, 8)
   FIELD(post_multiplier_7_to_14, 8)

   /* 18 */
   FIELD(out_image_circular_buf_size, 26) /* >> 6 */
   FIELD(unused7, 5)
   FIELD(per_channel_post_mul, 1)

   /* 19 */
   FIELD(out_image_circular_buf_end_addr_plus_1, 26) /* >> 6 */
   FIELD(unused8, 6)

   /* 20 */
   FIELD(in_image_circular_buf_size, 26) /* >> 6 */
   FIELD(unused9, 6)

   /* 21 */
   FIELD(in_image_circular_buf_end_addr_plus_1, 26) /* >> 6 */
   FIELD(unused10, 6)

   /* 22 */
   FIELD(coef_zero_point, 8)
   FIELD(out_zero_point, 8)
   FIELD(kernel_direct_stream_from_VIP_sram, 1)
   FIELD(depthwise, 1)
   FIELD(unused11, 14)

   /* 23, from here on the fields aren't set */
   FIELD(unused12, 32)

   /* 24 */
   FIELD(unused13, 4)
   FIELD(unused14, 28) /* 0 >> 4 */

   /* 25 */
   FIELD(unused15, 4)
   FIELD(unused16, 28) /* 0 >> 4 */

   /* 26 */
   FIELD(further1, 32)
   FIELD(further2, 32)
   FIELD(further3, 32)
   FIELD(further4, 32)
   FIELD(further5, 32)
   FIELD(further6, 32)
   FIELD(further7, 32)
   FIELD(further8, 32)
};

static void *
map_resource(struct pipe_resource *resource)
{
   return etna_bo_map(etna_resource(resource)->bo);
}

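/*
 * Pointwise (1x1) convolutions on single-channel inputs are lowered to 2x2
 * kernels padded with the weight zero point. The padding taps dequantize to
 * 0.0 and so shouldn't change the result; presumably the hardware can't
 * consume 1x1x1 kernels directly.
 */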
static void
pointwise_to_2x2(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
{
   /* Fill a Nx2x2xN tensor with zero_points */
   struct pipe_context *context = subgraph->base.context;
   uint8_t *input = map_resource(operation->weight_tensor);
   unsigned new_size = operation->output_channels * 2 * 2 * operation->input_channels;
   struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT,
                                                         new_size);
   uint8_t *output = map_resource(output_res);

   for (unsigned channel = 0; channel < operation->output_channels; channel++) {
      uint8_t *map_in = input + channel * 1 * 1 * operation->input_channels;
      uint8_t *map_out = output + channel * 2 * 2 * operation->input_channels;

      map_out[0] = map_in[0];
      map_out[1] = operation->weight_zero_point;
      map_out[2] = operation->weight_zero_point;
      map_out[3] = operation->weight_zero_point;
   }

   pipe_resource_reference(&operation->weight_tensor, NULL);
   operation->weight_tensor = output_res;

   operation->weight_width = operation->weight_height = 2;
   operation->pointwise = false;
}

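/*
 * Expand a depthwise convolution into a regular dense convolution: each
 * output channel gets a full kernel in which only the weights for its own
 * input channel are kept, and every other position is set to the weight
 * zero point (a quantized 0.0).
 */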
static void
expand_depthwise(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   uint8_t *input = map_resource(operation->weight_tensor);
   unsigned new_size = operation->output_channels * operation->weight_width * operation->weight_height * operation->input_channels;
   struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT,
                                                         new_size);
   uint8_t *output = map_resource(output_res);

   /* Lower the depthwise convolution to a regular convolution, as the hardware doesn't support depthwise natively */
   for (unsigned channel = 0; channel < operation->output_channels; channel++) {
      unsigned in_channel = channel / operation->output_channels;
      unsigned in_depth = channel % operation->output_channels;

      uint8_t *map_in = input + in_channel * operation->weight_width * operation->weight_height * operation->input_channels;
      uint8_t *map_out = output + channel * operation->weight_width * operation->weight_height * operation->input_channels;

      for (unsigned i = 0; i < operation->weight_width * operation->weight_height * operation->input_channels; i++) {
         if (i % operation->input_channels == in_depth)
            map_out[i] = map_in[i];
         else
            map_out[i] = operation->weight_zero_point;
      }
   }

   pipe_resource_reference(&operation->weight_tensor, NULL);
   operation->weight_tensor = output_res;
}

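/*
 * Reorder the weights from (out channel, x, y, input channel), as they come
 * from the model, to (out channel, input channel, x, y), which is how the
 * weight-stream writers below walk them. For additions the weight tensor is
 * a synthetic 1x2x2x2 blob, hence the override of the channel counts.
 */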
static void
transpose(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   void *map = map_resource(operation->weight_tensor);
   unsigned new_size = operation->output_channels * operation->weight_width * \
                       operation->weight_height * operation->input_channels;
   struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT,
                                                         new_size);
   uint8_t *output = map_resource(output_res);
   unsigned output_channels = operation->output_channels;
   unsigned input_channels = operation->input_channels;

   if (operation->addition) {
      output_channels = 1;
      input_channels = 2;
   }

   uint8_t (*input)[operation->weight_width][operation->weight_height][input_channels] = map;
   unsigned i = 0;
   for (unsigned d0 = 0; d0 < output_channels; d0++)
      for (unsigned d3 = 0; d3 < input_channels; d3++)
         for (unsigned d1 = 0; d1 < operation->weight_width; d1++)
            for (unsigned d2 = 0; d2 < operation->weight_height; d2++)
               ((uint8_t *)output)[i++] = input[d0][d1][d2][d3];

   pipe_resource_reference(&operation->weight_tensor, NULL);
   operation->weight_tensor = output_res;
}

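/*
 * Copy one plane of the (width, height, depth) input into a (width, height)
 * output, taking every stride-th pixel starting at (offset_x, offset_y) and
 * filling out-of-bounds positions with the input zero point. This is the
 * building block for the stride lowering in reshape() below.
 */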
static void
subsample(uint8_t *map_in, unsigned in_width, unsigned in_height, unsigned in_depth, unsigned out_width, unsigned out_height, unsigned in_z, unsigned offset_x, unsigned offset_y, unsigned stride, uint8_t *map_out, int in_zp)
{
   uint8_t (*in)[in_height][in_depth] = (uint8_t (*)[in_height][in_depth])map_in;
   uint8_t (*out)[out_height] = (uint8_t (*)[out_height])map_out;

   for (unsigned x = 0; x < out_width; x++)
      for (unsigned y = 0; y < out_height; y++) {
         unsigned in_x = x * stride + offset_x;
         unsigned in_y = y * stride + offset_y;
         if (in_x < in_width && in_y < in_height)
            out[x][y] = in[in_x][in_y][in_z];
         else
            out[x][y] = in_zp;
      }
}

/* TODO: Do the reshaping in the TP units, for big enough buffers */
static void
reshape(uint8_t *input, uint8_t *output, unsigned stride, int in_zp, unsigned dims_in[4], unsigned dims_out[4])
{
   for (unsigned out_channel = 0; out_channel < dims_in[0]; out_channel++) {
      void *map_in = input + out_channel * dims_in[1] * dims_in[2] * dims_in[3];
      void *map_out = output + out_channel * dims_out[1] * dims_out[2] * dims_out[3];

      /* See Figure 3 in https://arxiv.org/abs/1712.02502 */
      /* This is only valid for stride == 2 */
      assert(stride == 2);
      uint8_t (*out)[dims_out[1]][dims_out[2]] = (uint8_t (*)[dims_out[1]][dims_out[2]])map_out;
      for (unsigned z = 0; z < dims_in[3]; z++) {
         subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 0, 0, stride, (uint8_t *)out[0 + z * stride * stride], in_zp);
         subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 0, 1, stride, (uint8_t *)out[1 + z * stride * stride], in_zp);
         subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 1, 0, stride, (uint8_t *)out[2 + z * stride * stride], in_zp);
         subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 1, 1, stride, (uint8_t *)out[3 + z * stride * stride], in_zp);
      }
   }
}

static void
strided_to_normal(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   uint8_t *input = map_resource(operation->weight_tensor);
   unsigned new_size;
   struct pipe_resource *output_res;
   uint8_t *output;

   /* The hardware doesn't support strides natively, so we "lower" them as
    * described in this paper:
    *
    * "Take it in your stride: Do we need striding in CNNs?" https://arxiv.org/abs/1712.02502
    */

   /* TODO: Support more strides */
   assert(operation->stride == 2);

   unsigned wdims_in[4] = {operation->output_channels,
                           operation->weight_width,
                           operation->weight_height,
                           operation->input_channels};

   operation->input_channels = operation->input_channels * operation->stride * operation->stride;
   operation->input_width = DIV_ROUND_UP(operation->input_width, operation->stride);
   operation->input_height = DIV_ROUND_UP(operation->input_height, operation->stride);

   if (operation->padding_same) {
      if (operation->weight_width == 5) {
         operation->input_width += 2;
         operation->input_height += 2;
      } else {
         operation->input_width += 1;
         operation->input_height += 1;
      }
   }

   operation->weight_width = DIV_ROUND_UP(operation->weight_width, operation->stride);
   operation->weight_height = DIV_ROUND_UP(operation->weight_height, operation->stride);

   new_size = operation->output_channels * operation->weight_width * operation->weight_height * operation->input_channels;
   output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, new_size);
   output = map_resource(output_res);

   unsigned wdims_out[4] = {operation->output_channels, operation->weight_width, operation->weight_height, operation->input_channels};
   reshape(input, output, operation->stride, operation->weight_zero_point, wdims_in, wdims_out);

   pipe_resource_reference(&operation->weight_tensor, NULL);
   operation->weight_tensor = output_res;
}

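/*
 * Translate a pipe ML convolution operation into the parameters the NN unit
 * needs, rewriting the weight tensor as required: 1x1 kernels are padded to
 * 2x2, depthwise convolutions are expanded to dense ones, strided
 * convolutions are turned into stride-1 convolutions over a reshaped input,
 * and multi-channel weights are transposed into the layout the weight
 * stream expects.
 */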
void
etna_ml_lower_convolution(struct etna_ml_subgraph *subgraph,
                          const struct pipe_ml_operation *poperation,
                          struct etna_operation *operation)
{
   /* TODO: Support stride_x != stride_y */
   assert(poperation->conv.stride_x == poperation->conv.stride_y);
   assert(poperation->type == PIPE_ML_OPERATION_TYPE_CONVOLUTION);

   operation->type = ETNA_JOB_TYPE_NN;
   operation->addition = false;
   operation->depthwise = poperation->conv.depthwise;
   operation->pointwise = poperation->conv.pointwise;
   operation->pooling_first_pixel = poperation->conv.stride_x > 1 && \
                                    (poperation->conv.depthwise || poperation->conv.pointwise);
   operation->padding_same = poperation->conv.padding_same;
   operation->stride = poperation->conv.stride_x;

   operation->input_tensor = poperation->input_tensor->index;
   operation->input_width = poperation->input_tensor->dims[1];
   operation->input_height = poperation->input_tensor->dims[2];
   operation->input_channels = poperation->input_tensor->dims[3];
   operation->input_zero_point = poperation->input_tensor->zero_point;
   operation->input_scale = poperation->input_tensor->scale;

   operation->output_tensor = poperation->output_tensor->index;
   operation->output_width = poperation->output_tensor->dims[1];
   operation->output_height = poperation->output_tensor->dims[2];
   operation->output_channels = poperation->output_tensor->dims[3];
   operation->output_zero_point = poperation->output_tensor->zero_point;
   operation->output_scale = poperation->output_tensor->scale;

   pipe_resource_reference(&operation->weight_tensor, poperation->conv.weight_tensor->resource);
   operation->weight_width = poperation->conv.weight_tensor->dims[1];
   operation->weight_height = poperation->conv.weight_tensor->dims[2];
   operation->weight_zero_point = poperation->conv.weight_tensor->zero_point;
   operation->weight_scale = poperation->conv.weight_tensor->scale;

   pipe_resource_reference(&operation->bias_tensor, poperation->conv.bias_tensor->resource);

   if (operation->pointwise && operation->input_channels == 1)
      pointwise_to_2x2(subgraph, operation);

   if (operation->depthwise && (operation->output_channels > 1 || operation->stride > 1)) {

      if (operation->input_width < 8 && operation->input_width > 2)
         operation->pooling_first_pixel = false;

      expand_depthwise(subgraph, operation);
   }

   if (operation->stride > 1 && !operation->pooling_first_pixel)
      strided_to_normal(subgraph, operation); /* This will already transpose if input_channels > 1 */
   else if (operation->input_channels > 1)
      transpose(subgraph, operation);

   operation->input_tensor_size = operation->input_width *
                                  operation->input_height *
                                  operation->input_channels;
   ML_DBG("%dx%dx%d\n", operation->input_width, operation->input_height, operation->input_channels);
}

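/*
 * Elementwise addition is lowered to a convolution over a two-channel image
 * holding both addends (see etna_ml_lower_add() below). The helpers here
 * derive the quantized constants for it from the scales and zero points of
 * the two inputs: the scale ratio becomes the single non-zero weight,
 * addition_offset appears to act as the matching factor for the second
 * addend (write_sequential_weight_format() stores it in the slot that
 * otherwise holds the output offset), and the bias folds in the zero points.
 */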
static float
compute_weight_scale_add(float input1_scale, float input2_scale)
{
   double scale_ratio = input1_scale / input2_scale;

   return (float) MAX2(scale_ratio, 1.0) / 255.0;
}

static uint8_t
compute_addition_offset(float input1_scale, float input2_scale, float weight_scale)
{
   double addition_offset = input1_scale / input2_scale;
   addition_offset /= weight_scale;
   return round(addition_offset + 0.0) * 1;
}

static uint8_t
compute_weight_add(float input1_scale, float input2_scale, float weight_scale)
{
   double weight = 1.0 / weight_scale;
   return round(weight + 0.0);
}

static uint32_t
compute_bias_add(float input1_scale, float input2_scale, uint8_t input1_zp, uint8_t input2_zp, float weight_scale)
{
   int zero_point_diff = input2_zp - input1_zp;
   double bias = zero_point_diff * input1_scale;
   bias /= weight_scale * input2_scale;

   double addition_offset = input1_scale / input2_scale;
   addition_offset /= weight_scale;
   addition_offset = round(addition_offset + 0.0) * 1;

   return (int) (round(bias) - round(addition_offset) * input2_zp);
}

void
etna_ml_lower_add(struct etna_ml_subgraph *subgraph,
                  const struct pipe_ml_operation *poperation,
                  struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;

   assert(poperation->type == PIPE_ML_OPERATION_TYPE_ADD);

   operation->addition = true;
   operation->depthwise = false;
   operation->pointwise = false;
   operation->pooling_first_pixel = false;
   operation->padding_same = false;
   operation->stride = 1;

   operation->input_tensor = poperation->input_tensor->index;
   operation->add_input_tensor = poperation->add.input_tensor->index;
   operation->input_width = poperation->input_tensor->dims[1];
   operation->input_height = poperation->input_tensor->dims[2];
   operation->input_channels = poperation->input_tensor->dims[3];
   operation->input_zero_point = poperation->input_tensor->zero_point;
   operation->input_scale = poperation->input_tensor->scale;
   operation->input_tensor_size = operation->input_width *
                                  operation->input_height *
                                  operation->input_channels *
                                  2;

   operation->output_tensor = poperation->output_tensor->index;
   operation->output_width = poperation->output_tensor->dims[1];
   operation->output_height = poperation->output_tensor->dims[2];
   operation->output_channels = poperation->output_tensor->dims[3];
   operation->output_zero_point = poperation->output_tensor->zero_point;
   operation->output_scale = poperation->output_tensor->scale;

   operation->weight_tensor = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, 8);
   operation->weight_width = 2;
   operation->weight_height = 2;
   operation->weight_zero_point = 0x0;
   operation->weight_scale = compute_weight_scale_add(poperation->add.input_tensor->scale, poperation->input_tensor->scale);
   operation->addition_offset = compute_addition_offset(poperation->add.input_tensor->scale, poperation->input_tensor->scale, operation->weight_scale);

   uint8_t *weight_map = map_resource(operation->weight_tensor);
   memset(weight_map, 0, pipe_buffer_size(operation->weight_tensor));
   weight_map[0] = compute_weight_add(poperation->add.input_tensor->scale, poperation->input_tensor->scale, operation->weight_scale);

   operation->bias_tensor = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, 4);
   int32_t *bias_map = map_resource(operation->bias_tensor);
   bias_map[0] = compute_bias_add(poperation->add.input_tensor->scale, poperation->input_tensor->scale,
                                  poperation->add.input_tensor->zero_point, poperation->input_tensor->zero_point,
                                  operation->weight_scale);
}

#define ACCUM_BUFFER_DEPTH 64
#define INPUT_BUFFER_DEPTH 12
#define MAX_TILE_WIDTH 64

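/*
 * Decide how many "superblocks" each core's set of kernels is split into,
 * roughly so that each superblock's accumulations fit in the accumulation
 * buffer. For now the count is rounded up until it divides output_channels
 * evenly, which the compressed weight buffer layout below relies on.
 */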
static unsigned
calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode)
{
   unsigned nn_core_count = ctx->screen->specs.nn_core_count;
   unsigned kernels_per_core = DIV_ROUND_UP(operation->output_channels, nn_core_count);
   unsigned foo = (ACCUM_BUFFER_DEPTH * interleave_mode) / tile_y;

   if (operation->weight_width == 1)
      foo = MIN2(foo, ACCUM_BUFFER_DEPTH / 3);

   foo = MIN2(foo, kernels_per_core);
   foo = MIN2(foo, 127);

   kernels_per_core = DIV_ROUND_UP(operation->output_channels, nn_core_count * foo);
   unsigned num_kernels = DIV_ROUND_UP(operation->output_channels, kernels_per_core * nn_core_count);
   unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, nn_core_count), num_kernels);

   /* TODO: Remove this once we support superblocks that don't divide output_channels in the compressed buffer */
   while (operation->output_channels % superblocks)
      superblocks++;

   ML_DBG("superblocks %d\n", superblocks);

   return superblocks;
}

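/*
 * Pick how aggressively input rows can be interleaved in the input buffer:
 * wider tiles and taller kernels leave less room, so they force a smaller
 * interleave mode.
 */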
static unsigned
calc_interleave_mode(unsigned tile_width, unsigned weight_height)
{
   unsigned mode = 8;

   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2)
      return 1;

   if (tile_width > MAX_TILE_WIDTH / 2)
      mode = 1;
   else if (tile_width > MAX_TILE_WIDTH / 4)
      mode = 2;
   else if (tile_width > MAX_TILE_WIDTH / 8)
      mode = 4;

   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4)
      return MIN2(mode, 4);

   return MIN2(mode, 2);
}

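/*
 * For additions the two operands are treated as one image with two channels.
 * Pick a width that divides the total element count evenly (preferring 128,
 * 64 or 32) and fold the rest into the height; the output is the same image
 * with a single channel.
 */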
static void
calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *input_channels,
                    unsigned *output_width, unsigned *output_height, unsigned *output_channels)
{
   ML_DBG("addition input width %d channels %d\n", *input_width, *input_channels);

   unsigned channel_size = *input_width * *input_height;
   unsigned width = 0;
   if (channel_size % 128 == 0)
      width = 128;
   else if (channel_size % 64 == 0)
      width = 64;
   else if (channel_size % 32 == 0)
      width = 32;
   else {
      for (int i = 63; i > 0; i--) {
         if (channel_size % i == 0) {
            width = i;
            break;
         }
      }
   }

   *input_height = (*input_width * *input_height * *input_channels) / width;
   *input_width = width;
   *input_channels = 2;

   *output_height = *output_width * *output_height * *output_channels / width;
   *output_width = width;
   *output_channels = 1;
}

static unsigned
calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
{
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   unsigned output_width = operation->output_width;
   unsigned output_height = operation->output_height;
   unsigned output_channels = operation->output_channels;
   unsigned tile_width;
   unsigned tile_height;
   unsigned superblocks;
   unsigned interleave_mode;

   if (operation->addition)
      calc_addition_sizes(&input_width, &input_height, &input_channels,
                          &output_width, &output_height, &output_channels);

   if (operation->pooling_first_pixel) {
      output_width *= 2;
      output_height *= 2;
   }

   tile_width = MIN2(output_width, 64);
   interleave_mode = calc_interleave_mode(tile_width, operation->weight_height);

   tile_height = INPUT_BUFFER_DEPTH * interleave_mode - operation->weight_height + 1;
   ML_DBG("INPUT_BUFFER_DEPTH %d interleave_mode %d operation->weight_height %d tile_height %d input_width %d output_width %d\n", INPUT_BUFFER_DEPTH, interleave_mode, operation->weight_height, tile_height, operation->input_width, output_width);
   tile_height = MIN2(tile_height, interleave_mode * ACCUM_BUFFER_DEPTH);
   //tile_height = MIN2(tile_height, operation->input_width);
   tile_height = MIN2(tile_height, output_height);

   if (operation->stride > 1 && tile_height % 2 > 0)
      tile_height -= 1;

   superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode);
   ML_DBG("tiling x %d y %d sb %d\n", tile_width, tile_height, superblocks);

   if (tile_width_out)
      *tile_width_out = tile_width;

   if (tile_height_out)
      *tile_height_out = tile_height;

   return superblocks;
}

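/*
 * Fill in the hardware layer descriptor (struct etna_nn_params above) for
 * one NN job: input/output image addresses and geometry, kernel stream
 * address, tiling, SRAM caching windows for kernels and image, and the
 * requantization constants derived from the tensor scales.
 */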
static struct etna_bo *
create_nn_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct etna_bo *coefficients, unsigned coefficients_size)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_count = ctx->screen->specs.nn_core_count;
   unsigned oc_sram_size = ctx->screen->specs.on_chip_sram_size;
   struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
                                    sizeof(struct etna_nn_params),
                                    DRM_ETNA_GEM_CACHE_WC);
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   unsigned output_width = operation->output_width;
   unsigned output_height = operation->output_height;
   unsigned output_channels = operation->output_channels;
   unsigned weight_width = operation->weight_width;
   unsigned weight_height = operation->weight_height;

   if (operation->pointwise && input_channels == 1)
      weight_width = weight_height = 2;

   if (operation->addition)
      calc_addition_sizes(&input_width, &input_height, &input_channels,
                          &output_width, &output_height, &output_channels);

   unsigned input_size = input_width * input_height * input_channels;

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_nn_params *map = etna_bo_map(bo);
   map->layer_type = 0x0;
   map->no_z_offset = 0x0;
   map->prelu = 0x0;
   map->nn_layer_flush = 0x1;
   map->brick_mode = 0x0;
   map->brick_distance = 0x0;
   map->relu = 0x0;
   map->no_flush = 0x0;
   map->rounding_mode = 0x1;
   map->partial_cache_data_unit = 0x0;
   map->depthwise = 0x0;

   map->unused0 = 0x0;
   map->unused1 = 0x0;
   map->unused2 = 0x0;
   map->unused3 = 0x0;
   map->unused4 = 0x0;
   map->unused5 = 0x0;
   map->unused6 = 0x0;
   map->unused7 = 0x0;
   map->unused8 = 0x0;
   map->unused9 = 0x0;
   map->unused10 = 0x0;
   map->unused11 = 0x0;
   map->unused12 = 0x0;
   map->unused13 = 0x0;
   map->unused14 = 0x0;
   map->further1 = 0x0;
   map->further2 = 0x0;
   map->further3 = 0x3ffffff;
   map->further4 = 0x7f800000;
   map->further5 = 0xff800000;
   map->further6 = 0x0;
   map->further7 = 0x0;
   map->further8 = 0x0;

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
   unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensor);
   map->in_image_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset;
   map->in_image_x_size = input_width;
   map->in_image_y_size = input_height;
   map->in_image_x_stride = input_width;
   map->in_image_y_stride = input_height;
   map->in_image_data_type = ETNA_NN_INT8;
   map->in_image_data_type_bit_2 = ETNA_NN_INT8 >> 2;
   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_addr_plus_1 = 0xFFFFFFFF >> 6;
   map->in_image_border_mode = 0x0;
   map->in_image_border_const = operation->input_zero_point;

   if (operation->padding_same && operation->stride == 1 && weight_width > 2) {
      if (weight_width < 5) {
         map->in_image_x_offset = 0x7;
         map->in_image_y_offset = 0x7;
      } else {
         map->in_image_x_offset = 0x6;
         map->in_image_y_offset = 0x6;
      }
      map->in_image_x_offset_bit_3 = 0x1;
      map->in_image_y_offset_bit_3 = 0x1;
   } else {
      map->in_image_x_offset = 0x0;
      map->in_image_y_offset = 0x0;
      map->in_image_x_offset_bit_3 = 0x0;
      map->in_image_y_offset_bit_3 = 0x0;
   }

   if (operation->padding_same && operation->stride == 2 && weight_width == 5) {
      map->in_image_x_offset = 0x7;
      map->in_image_y_offset = 0x7;
      map->in_image_x_offset_bit_3 = 0x1;
      map->in_image_y_offset_bit_3 = 0x1;
   }

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
   offset = etna_ml_get_offset(subgraph, operation->output_tensor);
   map->out_image_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;
   map->out_image_x_size = output_width;
   map->out_image_y_size = output_height;
   map->out_image_z_size = output_channels;

   map->out_image_x_stride = map->out_image_x_size;
   map->out_image_y_stride = map->out_image_y_size;

   map->out_image_data_type = ETNA_NN_INT8;
   map->out_image_data_type_bit_2 = ETNA_NN_INT8 >> 2;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_addr_plus_1 = 0xFFFFFFFF >> 6;
   map->out_zero_point = operation->output_zero_point;

   if (operation->pooling_first_pixel) {
      map->pooling = ETNA_NN_POOLING_FIRST_PIXEL;
      map->pooling_xy_size = 0x0;

      map->out_image_x_size *= 2;
      map->out_image_y_size *= 2;
   } else {
      map->pooling = ETNA_NN_POOLING_NON;
      map->pooling_xy_size = 0x1;
   }

   unsigned tile_x, tile_y;
   unsigned superblocks = calculate_tiling(ctx, operation, &tile_x, &tile_y);
   map->out_image_tile_x_size = tile_x;
   map->out_image_tile_y_size = tile_y;

   map->kernel_address = etna_bo_gpu_va(coefficients) >> 6;
   map->kernel_xy_size = weight_width;
   map->kernel_y_size = weight_height;
   map->kernel_z_size = input_channels;
   map->kernel_z_size2 = 0x0;
   map->kernel_data_type = ETNA_NN_INT8;
   map->kernel_data_type_bit_2 = ETNA_NN_INT8 >> 2;
   map->kernel_direct_stream_from_VIP_sram = 0x0;

   map->coef_zero_point = operation->weight_zero_point;

   map->kernels_per_core = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), superblocks);

   /* Should be max accumBufferDepth (64) / zdpNum (3) */
   //assert(map->kernels_per_core <= (64 / 3));

   /* The header doesn't get cached */
   coefficients_size -= 64;

   map->kernel_cache_start_address = 0x800;
   map->kernel_cache_end_address = MAX2(MIN2(map->kernel_cache_start_address + coefficients_size, oc_sram_size), 0x1a00);

   if (output_channels <= 128 || map->kernel_cache_end_address == oc_sram_size) {
      map->image_caching_mode = SRAM_CACHE_MODE_NO_CACHE;
      map->image_cache_start_address = 0x0;
      map->image_cache_end_address = 0x800;
   } else {
      map->image_caching_mode = SRAM_CACHE_MODE_FULL_CACHE;
      map->image_cache_start_address = map->kernel_cache_end_address;
      map->image_cache_end_address = MIN2(map->image_cache_start_address + input_size + 1024, oc_sram_size);
   }

   /* TODO: Look at re-enabling the image cache again */
   map->image_caching_mode = SRAM_CACHE_MODE_NO_CACHE;
   map->image_cache_start_address = 0x0;
   map->image_cache_end_address = 0x800;

   if (etna_bo_size(coefficients) <= 0x80000 - 0x800) {
      map->kernel_caching_mode = SRAM_CACHE_MODE_FULL_CACHE;
      map->kernel_pattern_msb = 0x0;
      map->kernel_pattern_low = 0x0;
      map->kernel_pattern_high = 0x0;
   } else {
      /* Doesn't fit in the 512KB we have of on-chip SRAM */
      map->kernel_caching_mode = SRAM_CACHE_MODE_PARTIAL_CACHE;
      if (map->out_image_z_size >= 1024) {
         map->kernel_pattern_msb = 0x13;
         map->kernel_pattern_low = 0x80000;
         map->kernel_pattern_high = 0x0;
      } else if (map->out_image_z_size >= 512) {
         map->kernel_pattern_msb = 0x3d;
         map->kernel_pattern_low = 0x0;
         map->kernel_pattern_high = 0x2aaaaaa0;
      } else if (map->out_image_z_size >= 256) {
         map->kernel_pattern_msb = 0x3e;
         map->kernel_pattern_low = 0xffffaaaa;
         map->kernel_pattern_high = 0x7fffffff;
      } else if (map->out_image_z_size >= 160) {
         map->kernel_pattern_msb = 0x6;
         map->kernel_pattern_low = 0x7e;
         map->kernel_pattern_high = 0x0;
      } else {
         map->kernel_pattern_msb = 0x3f;
         map->kernel_pattern_low = 0xfffffffe;
         map->kernel_pattern_high = 0xffffffff;
      }
   }

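   /* Requantization: the accumulator is in input_scale * weight_scale units
    * and has to be rescaled to output_scale units, so the float scale below
    * is decomposed into a mantissa-like multiplier and an exponent-like
    * shift for the hardware.
    */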
   float conv_scale = (operation->input_scale * operation->weight_scale) / operation->output_scale;
   uint32_t scale_bits = fui(conv_scale);
   /* Taken from https://github.com/pytorch/QNNPACK/blob/master/src/qnnpack/requantization.h#L130 */
   unsigned shift = 127 + 31 - 32 - (scale_bits >> 23) + 16;

   /* Divides by 2^(post_shift - 18), rounding to nearest integer. If the result doesn't fit in 8 bits, it is clamped to 255. galcore sets this to 15 for INT8, to 0 for UINT8. */
   map->post_shift = shift & 0x1f;
   map->post_shift_bit_5_6 = (shift >> 5) & 0x3;

   /* Multiplies by (multiplier * 2^15) */
   map->post_multiplier = (scale_bits >> 8) & 0x1;
   map->post_multiplier_1_to_6 = (scale_bits >> 9) & 0x3f;
   map->post_multiplier_7_to_14 = (scale_bits >> 15) & 0xff;

   map->per_channel_post_mul = 0x0;

   etna_bo_cpu_fini(bo);

   return bo;
}

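/*
 * The hardware subtracts the coefficient zero point from each weight but
 * doesn't appear to apply the input zero point, so the constant term
 * sum(weight - weight_zp) * input_zp is computed here and subtracted from
 * the bias when the weight streams are written out.
 */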
static uint32_t calculate_bias_correction(uint8_t *weights, const struct etna_operation *operation)
{
   int32_t correction = 0;

   for (unsigned i = 0; i < operation->weight_width * operation->weight_height * operation->input_channels; i++) {
      correction += (weights[i] - operation->weight_zero_point) * operation->input_zero_point;
   }

   return correction;
}

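/*
 * Weight stream layout used for pointwise convolutions with many output
 * channels: kernels are distributed across cores and grouped into
 * superblocks; within a superblock the input channels are emitted in blocks
 * of up to 6, with the 32-bit corrected bias inserted right after the very
 * first weight of each kernel and the 32-bit output offset appended after
 * its last block.
 */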
static void
write_6_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const struct etna_operation *operation)
{
   struct pipe_context *pctx = subgraph->base.context;
   unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count;
   unsigned cores_used = MIN2(operation->output_channels, nn_core_count);
   uint8_t *input = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned out_values_per_channel = operation->output_width * operation->output_height;
   unsigned stride = MIN2(operation->input_channels, 6);
   unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
   uint8_t *weights_maps[DIV_ROUND_UP(kernels_per_core, superblocks)];

   ML_DBG("%s\n", __func__);

   for (unsigned superblock = 0; superblock < superblocks; superblock++) {

      unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
      if (superblock == superblocks - 1)
         kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks;

      for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
         unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used;
         weights_maps[kernel] = input + out_channel * operation->weight_width * operation->weight_height * operation->input_channels;
      }

      for (unsigned block = 0; block < DIV_ROUND_UP(operation->input_channels, stride); block++) {
         for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
            unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used;

            if (block == 0) {
               *map++ = weights_maps[kernel][0];

               uint32_t corr = calculate_bias_correction(weights_maps[kernel], operation);
               //fprintf(stderr, "core %d sb %d b %d kernel %d out_channel %d bias %x first %02x\n", core, superblock, block, kernel, out_channel, biases[out_channel] - corr, weights_maps[kernel][0]);
               *((uint32_t *)map) = biases[out_channel] - corr;
               map += sizeof(uint32_t);

               for (int i = 1; i < stride; i++) {
                  *map++ = weights_maps[kernel][i];
               }
            } else {
               for (int i = 0; i < stride; i++) {
                  if (i + block * stride < operation->input_channels)
                     *map++ = weights_maps[kernel][i + block * stride];
               }
            }
            if (block == DIV_ROUND_UP(operation->input_channels, stride) - 1) {
               *((uint32_t *)map) = out_values_per_channel * out_channel;
               map += sizeof(uint32_t);
            }
         }
      }
   }
}

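/*
 * Weight stream layout for kernels with more than one input channel: within
 * each superblock the weights are emitted one input channel at a time across
 * all of the superblock's kernels, two kernel columns per block, with the
 * 32-bit corrected bias inserted after the first value of each kernel and
 * the 32-bit output offset appended once its last input channel has been
 * written.
 */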
static void
write_interleaved_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const struct etna_operation *operation)
{
   struct pipe_context *pctx = subgraph->base.context;
   unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count;
   unsigned cores_used = MIN2(operation->output_channels, nn_core_count);
   uint8_t *input = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned out_values_per_channel = operation->output_width * operation->output_height;
   unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
   uint8_t (*weights_map)[operation->input_channels][operation->weight_width][operation->weight_height] = (void *)input;

   ML_DBG("%s core %d\n", __func__, core);

   for (unsigned superblock = 0; superblock < superblocks; superblock++) {

      unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
      if (superblock == superblocks - 1)
         kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks;

      for (unsigned z = 0; z < operation->input_channels; z++) {
         for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
            unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used;

#if 0
            if (z == 0)
               fprintf(stderr, "core %d DIV_ROUND_UP(kernels_per_core, superblocks) %d kernel %d superblock * (operation->output_channels / superblocks) %u out_channel %d\n",
                       core, DIV_ROUND_UP(kernels_per_core, superblocks), kernel, superblock * (operation->output_channels / superblocks + 4), out_channel);
#endif

            for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) {
               unsigned stride = operation->weight_height;
               if (operation->weight_height > 3)
                  stride = 3;
               for (unsigned x = block * 2; x < (block + 1) * 2; x++) {
                  if (x >= operation->weight_width)
                     break;
                  for (unsigned y = 0; y < stride; y++) {
                     //fprintf(stderr, "oc %d x %d y %d z %d: %02x\n", out_channel, x, y, z, weights_map[out_channel][z][x][y]);
                     *map++ = weights_map[out_channel][z][x][y];
                     if (x == 0 && y == 0 && z == 0) {
                        uint32_t corr = calculate_bias_correction((uint8_t *)weights_map[out_channel], operation);
                        //fprintf(stderr, "core %d sb %d ic %d out_channel %d kernel %d bias %x first %02x\n", core, superblock, z, out_channel, kernel, biases[out_channel] - corr, weights_map[out_channel][z][x][y]);
                        *((uint32_t *)map) = biases[out_channel] - corr;
                        map += sizeof(uint32_t);
                     }
                  }
               }
               if (operation->weight_height > 3) {
                  for (unsigned x = block * 2; x < (block + 1) * 2; x++) {
                     if (x >= operation->weight_width)
                        break;
                     for (unsigned y = stride; y < operation->weight_width; y++) {
                        //fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[out_channel][z][x][y]);
                        *map++ = weights_map[out_channel][z][x][y];
                     }
                  }
               }
            }

            if (z == operation->input_channels - 1) {
               *((uint32_t *)map) = out_values_per_channel * out_channel;
               map += sizeof(uint32_t);
            }
         }
      }
   }
}

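/*
 * Weight stream layout for single-input-channel kernels: each kernel is
 * written out whole, two columns at a time, with the 32-bit corrected bias
 * inserted after its first value. The trailing 32-bit word is the output
 * offset of the kernel, or the addition offset when the operation is a
 * lowered elementwise addition.
 */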
static void
write_sequential_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const struct etna_operation *operation)
{
   struct pipe_context *pctx = subgraph->base.context;
   unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count;
   unsigned cores_used = MIN2(operation->output_channels, nn_core_count);
   uint8_t *input = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned out_values_per_channel = operation->output_width * operation->output_height;
   unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);

   ML_DBG("%s: superblocks %d channels %d\n", __func__, superblocks, operation->output_channels);

   for (unsigned superblock = 0; superblock < superblocks; superblock++) {

      unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
      if (superblock == superblocks - 1)
         kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks;

      for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
         unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used;

         uint8_t (*weights_map)[operation->weight_height] = (void *)input + out_channel * operation->weight_width * operation->weight_height;

         for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) {
            unsigned stride = operation->weight_height;
            if ((operation->depthwise || operation->input_width > 64) && \
                operation->weight_height > 3)
               stride = 3;
            for (unsigned x = block * 2; x < (block + 1) * 2; x++) {
               if (x >= operation->weight_width)
                  break;
               for (unsigned y = 0; y < stride; y++) {
                  //fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[x][y]);

                  *map++ = weights_map[x][y];
                  if (x == 0 && y == 0) {
                     uint32_t corr = calculate_bias_correction((uint8_t *)weights_map, operation);
                     *((uint32_t *)map) = biases[out_channel] - corr;
                     map += sizeof(uint32_t);
                  }
               }
            }
            if ((operation->depthwise || operation->input_width > 64) && \
                operation->weight_height > 3) {
               for (unsigned x = block * 2; x < (block + 1) * 2; x++) {
                  if (x >= operation->weight_width)
                     break;
                  for (unsigned y = stride; y < operation->weight_width; y++) {
                     //fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[x][y]);
                     *map++ = weights_map[x][y];
                  }
               }
            }
         }
         if (operation->addition) {
            *((uint32_t *)map) = operation->addition_offset;
         } else
            *((uint32_t *)map) = out_values_per_channel * out_channel;
         map += sizeof(uint32_t);
      }
   }
}

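/*
 * Build the coefficients buffer consumed via kernel_address: a 64-byte
 * aligned header with one 32-bit word per core giving that core's stream
 * size, followed by each core's stream (a zero-length-encoding flag byte, a
 * 16-bit kernel count, then the weights, biases and offsets in one of the
 * three layouts above). core_size therefore counts 3 bytes of per-core
 * header plus, per kernel, its weights, a 32-bit bias and a 32-bit offset.
 */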
static struct etna_bo *
create_coefficients_bo(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *size)
{
   /* TODO: Implement zero-length encoding of weights and biases for bandwidth savings */
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_count = ctx->screen->specs.nn_core_count;
   unsigned header_size = ALIGN(nn_core_count * 4, 64);
   unsigned weight_item_size = 1; /* TODO: Support types other than (u)int8 */
   unsigned input_channels;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
   uint8_t zero_length_encoding = false;
   unsigned weights_size;
   unsigned core_size;
   unsigned core_size_aligned;

   input_channels = operation->addition ? 1 : operation->input_channels;
   weights_size = operation->weight_width * operation->weight_height * input_channels * weight_item_size;
   core_size = 3 + (weights_size + 4 + 4) * kernels_per_core;
   core_size_aligned = ALIGN(core_size, 64);
   *size = header_size + core_size_aligned * cores_used;

   struct etna_bo *compressed = etna_bo_new(ctx->screen->dev,
                                            *size,
                                            DRM_ETNA_GEM_CACHE_WC);

   etna_bo_cpu_prep(compressed, DRM_ETNA_PREP_WRITE);

   uint8_t *map = etna_bo_map(compressed);
   uint32_t *header = (uint32_t *)map;

   memset(map, 0, *size);

   for (unsigned core = 0; core < cores_used; core++)
      header[core] = core_size_aligned;

   map += header_size;

#if 0
   uint8_t *input = map_resource(operation->weight_tensor);
   for (int i = 0; i < operation->output_channels * operation->input_channels * operation->weight_width * operation->weight_height; i++)
      fprintf(stderr, "i %d: %02x\n", i, input[i]);
#endif

   for (unsigned core = 0; core < cores_used; core++) {

      *map++ = zero_length_encoding;

      *((uint16_t *)map) = kernels_per_core;
      map += sizeof(uint16_t);

      if (operation->pointwise && input_channels >= 1 && output_channels > 8)
         write_6_weight_format(subgraph, map, kernels_per_core, core, operation);
      else if (input_channels > 1)
         write_interleaved_weight_format(subgraph, map, kernels_per_core, core, operation);
      else
         write_sequential_weight_format(subgraph, map, kernels_per_core, core, operation);

      map += core_size_aligned - 3;
   }

   etna_bo_cpu_fini(compressed);

   return compressed;
}

void
etna_ml_compile_operation_nn(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
                             struct etna_vip_instruction *instruction)
{
   unsigned coefficients_size;

   instruction->type = ETNA_JOB_TYPE_NN;
   instruction->coefficients = create_coefficients_bo(subgraph, operation, &coefficients_size);

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
   assert(input);
   pipe_resource_reference(&instruction->input, input);

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
   assert(output);
   pipe_resource_reference(&instruction->output, output);

   instruction->configs[0] = create_nn_config(subgraph, operation, instruction->coefficients, coefficients_size);
}

void
etna_ml_emit_operation_nn(struct etna_ml_subgraph *subgraph,
                          struct etna_vip_instruction *operation,
                          unsigned idx)
{
   struct pipe_context *pctx = subgraph->base.context;
   struct etna_context *ctx = etna_context(pctx);
   struct etna_cmd_stream *stream = ctx->stream;
   unsigned offset = idx + 1;
   unsigned nn_config = VIVS_GL_NN_CONFIG_NN_CORE_COUNT(0x0); /* This disables power control of NN cores and enables all of them */

   if (DBG_ENABLED(ETNA_DBG_NPU_NO_PARALLEL)) {
      nn_config |= VIVS_GL_NN_CONFIG_SMALL_BATCH;
      offset = 0;
   }

   etna_set_state(stream, VIVS_GL_OCB_REMAP_START, 0x0);
   etna_set_state(stream, VIVS_GL_OCB_REMAP_END, 0x0);

   etna_set_state(stream, VIVS_GL_NN_CONFIG, nn_config);
   etna_set_state_reloc(stream, VIVS_PS_NN_INST_ADDR, &(struct etna_reloc) {
      .bo = operation->configs[0],
      .flags = ETNA_RELOC_READ,
      .offset = offset,
   });
   etna_set_state(stream, VIVS_PS_UNK10A4, offset);
}
1183