/*
 * Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
 * SPDX-License-Identifier: MIT
 */

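/*
 * Compilation and dispatch of ML subgraphs for the Vivante NPU: Gallium ML
 * operations are lowered to a list of NN, TP, concat and split jobs, tensors
 * are backed by pipe resources, and the resulting command streams are
 * submitted to the hardware.
 */
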
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>

#include "util/u_inlines.h"

#include "etnaviv_context.h"
#include "etnaviv_debug.h"
#include "etnaviv_emit.h"
#include "etnaviv_ml_nn.h"
#include "etnaviv_ml_tp.h"
#include "etnaviv_ml.h"

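/*
 * Tensors are tracked in three parallel dynarrays indexed by tensor index:
 * the backing pipe_resource, the offset of the tensor within that resource
 * (non-zero when a tensor aliases part of another, as for concat/split), and
 * its size in bytes.
 */
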
struct pipe_resource *
etna_ml_get_tensor(struct etna_ml_subgraph *subgraph, unsigned idx)
{
   return *util_dynarray_element(&subgraph->tensors, struct pipe_resource *, idx);
}

unsigned
etna_ml_get_offset(struct etna_ml_subgraph *subgraph, unsigned idx)
{
   return *util_dynarray_element(&subgraph->offsets, unsigned, idx);
}

unsigned
etna_ml_get_size(struct etna_ml_subgraph *subgraph, unsigned idx)
{
   return *util_dynarray_element(&subgraph->sizes, unsigned, idx);
}

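/* Reserve a new tensor index with no backing resource yet; returns the index
 * of the new entry in the bookkeeping arrays.
 */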
unsigned
etna_ml_allocate_tensor(struct etna_ml_subgraph *subgraph)
{
   struct pipe_resource **tensors = util_dynarray_grow(&subgraph->tensors, struct pipe_resource *, 1);
   tensors[0] = NULL;

   unsigned *offsets = util_dynarray_grow(&subgraph->offsets, unsigned, 1);
   offsets[0] = 0;

   unsigned *sizes = util_dynarray_grow(&subgraph->sizes, unsigned, 1);
   sizes[0] = 0;

   return util_dynarray_num_elements(&subgraph->tensors, struct pipe_resource *) - 1;
}

static void
etna_ml_create_tensor(struct etna_ml_subgraph *subgraph, unsigned idx, unsigned size)
{
   struct pipe_context *context = subgraph->base.context;
   struct pipe_resource **tensors = util_dynarray_begin(&subgraph->tensors);
   unsigned *sizes = util_dynarray_begin(&subgraph->sizes);

   assert(idx < util_dynarray_num_elements(&subgraph->tensors, struct pipe_resource *));

   struct pipe_resource *res = tensors[idx];

   if (res != NULL) {
      assert(size == sizes[idx]);
      return;
   }

   res = etna_ml_create_resource(context, size);
   tensors[idx] = res;
   sizes[idx] = size;

   ML_DBG("created resource %p for tensor %d with size %d\n", res, idx, size);
}

static void
etna_ml_destroy_tensor(struct etna_ml_subgraph *subgraph, unsigned idx)
{
   struct pipe_resource **tensors = util_dynarray_begin(&subgraph->tensors);
   unsigned *offsets = util_dynarray_begin(&subgraph->offsets);
   unsigned *sizes = util_dynarray_begin(&subgraph->sizes);

   pipe_resource_reference(&tensors[idx], NULL);
   offsets[idx] = 0;
   sizes[idx] = 0;
}

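/* Allocate a write-combined BO for job configuration data and zero it, so
 * unused fields of the hardware structures have a known value.
 */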
struct etna_bo *
etna_ml_create_bo(struct pipe_context *pctx, size_t size)
{
   struct etna_context *ctx = etna_context(pctx);
   struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
                                    size,
                                    DRM_ETNA_GEM_CACHE_WC);

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);
   void *map = etna_bo_map(bo);
   memset(map, 0, size);
   etna_bo_cpu_fini(bo);

   return bo;
}

struct pipe_resource *
etna_ml_create_resource(struct pipe_context *pctx, size_t size)
{
   struct pipe_resource *res = pipe_buffer_create(pctx->screen, 0, PIPE_USAGE_DEFAULT, size);
   void *ptr = etna_bo_map(etna_resource(res)->bo);
   memset(ptr, 0, pipe_buffer_size(res));

   return res;
}

struct etna_core_npu_info *
etna_ml_get_core_info(struct etna_context *context)
{
   struct etna_screen *screen = context->screen;
   struct etna_core_info *info = etna_gpu_get_core_info(screen->npu);
   return &info->npu;
}

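/* Strided convolutions are not always supported directly by the convolution
 * units; in those cases a TP reshuffle job is inserted first, which appears
 * to rearrange the input so that an equivalent stride-1 convolution can be
 * run. Which cases need it depends on the NN core version.
 */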
static bool
needs_reshuffle(struct etna_ml_subgraph *subgraph, const struct pipe_ml_operation *poperation)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_version = ctx->screen->specs.nn_core_version;
   bool has_stride = poperation->conv.stride_x > 1 || poperation->conv.stride_y > 1;
   bool pointwise = poperation->conv.pointwise;
   unsigned input_width = poperation->input_tensors[0]->dims[1];

   if (!has_stride)
      return false;

   if (nn_core_version < 8)
      return !(poperation->conv.depthwise && (input_width > 5 || input_width < 3)) && !pointwise;
   else {
      unsigned input_channels = poperation->input_tensors[0]->dims[3];

      if (poperation->conv.depthwise)
         return false;

      if (poperation->conv.pointwise && input_width >= 3 && input_channels > 1)
         return false;

      if (poperation->conv.pointwise && poperation->conv.padding_same)
         return false;

      return true;
   }
}

static const struct pipe_ml_operation *
etna_ml_find_producer(const struct pipe_ml_operation *poperations,
                      unsigned count,
                      unsigned tensor_idx)
{
   for (unsigned i = 0; i < count; i++) {
      const struct pipe_ml_operation *poperation = &poperations[i];

      for (unsigned j = 0; j < poperation->output_count; j++)
         if (poperation->output_tensors[j]->index == tensor_idx)
            return poperation;
   }

   return NULL;
}

static const struct pipe_ml_operation *
etna_ml_find_consumer(const struct pipe_ml_operation *poperations,
                      unsigned count,
                      unsigned tensor_idx)
{
   for (unsigned i = 0; i < count; i++) {
      const struct pipe_ml_operation *poperation = &poperations[i];

      for (unsigned j = 0; j < poperation->input_count; j++)
         if (poperation->input_tensors[j]->index == tensor_idx)
            return poperation;
   }

   return NULL;
}

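/* Multi-channel tensors that enter or leave the subgraph need their layout
 * converted: inputs with more than one channel and no producer inside the
 * subgraph get a TP transpose job, and outputs with no consumer inside the
 * subgraph get a matching detranspose job.
 */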
static bool
needs_transpose(const struct pipe_ml_operation *poperations,
                unsigned count,
                const struct pipe_ml_operation *poperation)
{
   const struct pipe_ml_operation *producer;

   if (poperation->input_tensors[0]->dims[3] == 1)
      return false;

   producer = etna_ml_find_producer(poperations, count, poperation->input_tensors[0]->index);
   if (!producer)
      return true;

   return false;
}

static bool
needs_detranspose(const struct pipe_ml_operation *poperations,
                  unsigned count,
                  const struct pipe_ml_operation *poperation)
{
   const struct pipe_ml_operation *consumer;

   if (poperation->output_tensors[0]->dims[3] == 1)
      return false;

   /* TODO: Support multiple consumers */
   consumer = etna_ml_find_consumer(poperations, count, poperation->output_tensors[0]->index);
   if (!consumer)
      return true;

   return false;
}

static void
reference_tensor_with_offset(struct etna_ml_subgraph *subgraph,
                             unsigned src_tensor,
                             unsigned dst_tensor,
                             unsigned offset,
                             unsigned size)
{
   struct pipe_resource **tensors = util_dynarray_begin(&subgraph->tensors);
   unsigned *offsets = util_dynarray_begin(&subgraph->offsets);
   unsigned *sizes = util_dynarray_begin(&subgraph->sizes);
   pipe_resource_reference(&tensors[dst_tensor], tensors[src_tensor]);
   offsets[dst_tensor] = offset;
   sizes[dst_tensor] = size;
}

static void
dump_graph(struct list_head *etna_operations)
{
   ML_DBG("\n");
   ML_DBG("dumping intermediate graph: %d operations\n", list_length(etna_operations));

   ML_DBG("\n");
   ML_DBG("%3s %-4s %3s %3s %s\n", "idx", "type", "in", "out", "operation type-specific");
   ML_DBG("================================================================================================\n");
   unsigned i = 0;
   list_for_each_entry(struct etna_operation, operation, etna_operations, link) {
      switch (operation->type) {
      case ETNA_JOB_TYPE_TP:
         ML_DBG("%3d %-4s %3d %3d",
                i, "TP", operation->input_tensors[0], operation->output_tensors[0]);
         break;
      case ETNA_JOB_TYPE_NN:
         ML_DBG("%3d %-4s %3d %3d in2: %3d",
                i, "NN", operation->input_tensors[0], operation->output_tensors[0], operation->input_tensors[1]);
         break;
      case ETNA_JOB_TYPE_CONCAT:
         ML_DBG("%3d %-4s %3d %3d in2: %3d",
                i, "CONC", operation->input_tensors[0], operation->output_tensors[0], operation->input_tensors[1]);
         break;
      case ETNA_JOB_TYPE_SPLIT:
         ML_DBG("%3d %-4s %3d %3d out2: %3d",
                i, "SPLIT", operation->input_tensors[0], operation->output_tensors[0], operation->output_tensors[1]);
         break;
      }
      ML_DBG("\n");
      i++;
   }
   ML_DBG("\n");
}

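/* Lower the Gallium-level operations to a list of etna_operations: insert
 * transpose/reshuffle/detranspose jobs where the hardware needs them, then
 * make a second pass to allocate the tensors' backing memory.
 */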
static void
lower_operations(struct etna_ml_subgraph *subgraph,
                 const struct pipe_ml_operation *poperations,
                 unsigned count,
                 struct list_head *etna_operations)
{
   for (unsigned i = 0; i < count; i++) {
      const struct pipe_ml_operation *poperation = &poperations[i];

      switch (poperation->type) {
      case PIPE_ML_OPERATION_TYPE_CONVOLUTION: {
         unsigned input_tensor = poperation->input_tensors[0]->index;

         if (needs_transpose(poperations, count, poperation)) {
            ML_DBG("Adding transpose for convolution operation.\n");
            struct etna_operation *operation = calloc(1, sizeof(*operation));
            etna_ml_lower_transpose(subgraph, poperation->input_tensors[0], operation, &input_tensor);
            list_addtail(&operation->link, etna_operations);
         }

         if (needs_reshuffle(subgraph, poperation)) {
            ML_DBG("Adding reshuffle for convolution operation.\n");
            struct etna_operation *operation = calloc(1, sizeof(*operation));
            unsigned temp = 0;
            etna_ml_lower_reshuffle(subgraph, poperation, operation, &temp);
            operation->input_tensors[0] = input_tensor;
            input_tensor = temp;
            list_addtail(&operation->link, etna_operations);
         }

         ML_DBG("Adding convolution.\n");
         struct etna_operation *operation = calloc(1, sizeof(*operation));
         etna_ml_lower_convolution(subgraph, poperation, operation);
         operation->input_tensors[0] = input_tensor;
         list_addtail(&operation->link, etna_operations);

         if (needs_detranspose(poperations, count, poperation)) {
            ML_DBG("Adding detranspose for convolution operation.\n");
            struct etna_operation *detranspose = calloc(1, sizeof(*detranspose));
            etna_ml_lower_detranspose(subgraph, operation, detranspose);
            operation->output_tensors[0] = detranspose->input_tensors[0];
            list_addtail(&detranspose->link, etna_operations);
         }
         break;
      }
      case PIPE_ML_OPERATION_TYPE_ADD: {
         struct etna_operation *operation = calloc(1, sizeof(*operation));
         etna_ml_lower_add(subgraph, poperation, operation);
         list_addtail(&operation->link, etna_operations);

         if (needs_detranspose(poperations, count, poperation)) {
            struct etna_operation *detranspose = calloc(1, sizeof(*detranspose));
            etna_ml_lower_detranspose(subgraph, operation, detranspose);
            operation->output_tensors[0] = detranspose->input_tensors[0];
            list_addtail(&detranspose->link, etna_operations);
         }
         break;
      }
      case PIPE_ML_OPERATION_TYPE_CONCATENATION: {
         bool do_transpose = needs_transpose(poperations, count, poperation);

         struct etna_operation *operation = calloc(1, sizeof(*operation));
         operation->type = ETNA_JOB_TYPE_CONCAT;
         assert(poperation->input_count <= MAX_TENSORS);
         for (unsigned i = 0; i < poperation->input_count; i++) {
            unsigned input_tensor = poperation->input_tensors[i]->index;

            if (do_transpose) {
               struct etna_operation *transpose = calloc(1, sizeof(*transpose));
               etna_ml_lower_transpose(subgraph, poperation->input_tensors[i], transpose, &input_tensor);
               list_addtail(&transpose->link, etna_operations);
            }

            operation->input_tensors[i] = input_tensor;
            operation->input_tensor_sizes[i] = poperation->input_tensors[i]->dims[1] *
                                               poperation->input_tensors[i]->dims[2] *
                                               poperation->input_tensors[i]->dims[3];
         }
         operation->input_count = poperation->input_count;

         operation->output_tensors[0] = poperation->output_tensors[0]->index;
         operation->output_width = poperation->output_tensors[0]->dims[1];
         operation->output_height = poperation->output_tensors[0]->dims[2];
         operation->output_channels = poperation->output_tensors[0]->dims[3];
         operation->output_tensor_sizes[0] = operation->output_width *
                                             operation->output_height *
                                             operation->output_channels;

         list_addtail(&operation->link, etna_operations);

         if (needs_detranspose(poperations, count, poperation)) {
            struct etna_operation *detranspose = calloc(1, sizeof(*detranspose));
            etna_ml_lower_detranspose(subgraph, operation, detranspose);
            operation->output_tensors[0] = detranspose->input_tensors[0];
            list_addtail(&detranspose->link, etna_operations);
         }

         break;
      }
      case PIPE_ML_OPERATION_TYPE_SPLIT: {
         struct etna_operation *operation = calloc(1, sizeof(*operation));
         operation->type = ETNA_JOB_TYPE_SPLIT;

         operation->input_tensors[0] = poperation->input_tensors[1]->index;
         operation->input_tensor_sizes[0] = poperation->input_tensors[1]->dims[1] *
                                            poperation->input_tensors[1]->dims[2] *
                                            poperation->input_tensors[1]->dims[3];

         assert(poperation->output_count <= MAX_TENSORS);
         for (unsigned i = 0; i < poperation->output_count; i++) {
            operation->output_tensors[i] = poperation->output_tensors[i]->index;
            operation->output_tensor_sizes[i] = poperation->output_tensors[i]->dims[1] *
                                                poperation->output_tensors[i]->dims[2] *
                                                poperation->output_tensors[i]->dims[3];
         }
         operation->output_count = poperation->output_count;

         list_addtail(&operation->link, etna_operations);

         break;
      }
      case PIPE_ML_OPERATION_TYPE_PAD: {
         unsigned input_tensor = poperation->input_tensors[0]->index;

         if (needs_transpose(poperations, count, poperation)) {
            struct etna_operation *operation = calloc(1, sizeof(*operation));
            etna_ml_lower_transpose(subgraph, poperation->input_tensors[0], operation, &input_tensor);
            list_addtail(&operation->link, etna_operations);
         }

         ML_DBG("Adding pad operation.\n");
         struct etna_operation *operation = calloc(1, sizeof(*operation));
         etna_ml_lower_pad(subgraph, poperation, operation);
         operation->input_tensors[0] = input_tensor;
         list_addtail(&operation->link, etna_operations);

         if (needs_detranspose(poperations, count, poperation)) {
            struct etna_operation *detranspose = calloc(1, sizeof(*detranspose));
            etna_ml_lower_detranspose(subgraph, operation, detranspose);
            operation->output_tensors[0] = detranspose->input_tensors[0];
            list_addtail(&detranspose->link, etna_operations);
         }

         break;
      }
      case PIPE_ML_OPERATION_TYPE_FULLY_CONNECTED: {
         struct etna_operation *operation = calloc(1, sizeof(*operation));
         etna_ml_lower_fully_connected(subgraph, poperation, operation);
         list_addtail(&operation->link, etna_operations);
         break;
      }
      default:
         unreachable("Unsupported ML operation type");
      }
   }

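   /* Second pass: give every tensor its backing memory. Concat inputs are
    * aliased into the concat's output buffer at increasing offsets, and
    * split outputs are aliased into the split's input buffer, so no copies
    * are needed at execution time. The two inputs of an add are packed into
    * a single buffer, one after the other.
    */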
   list_for_each_entry(struct etna_operation, operation, etna_operations, link) {
      if (operation->type == ETNA_JOB_TYPE_CONCAT) {
         etna_ml_create_tensor(subgraph, operation->output_tensors[0], operation->output_tensor_sizes[0]);

         unsigned offset = 0;
         for (unsigned i = 0; i < operation->input_count; i++) {
            reference_tensor_with_offset(subgraph,
                                         operation->output_tensors[0],
                                         operation->input_tensors[i],
                                         offset,
                                         operation->input_tensor_sizes[i]);
            offset += operation->input_tensor_sizes[i];
         }
      } else if (operation->type == ETNA_JOB_TYPE_SPLIT) {
         etna_ml_create_tensor(subgraph, operation->input_tensors[0], operation->input_tensor_sizes[0]);

         unsigned offset = 0;
         for (unsigned i = 0; i < operation->output_count; i++) {
            reference_tensor_with_offset(subgraph,
                                         operation->input_tensors[0],
                                         operation->output_tensors[i],
                                         offset,
                                         operation->output_tensor_sizes[i]);
            offset += operation->output_tensor_sizes[i];
         }
      } else if (operation->type == ETNA_JOB_TYPE_NN && operation->input_count > 1) { /* Add */
         etna_ml_destroy_tensor(subgraph, operation->input_tensors[0]);
         etna_ml_create_tensor(subgraph, operation->input_tensors[0], operation->input_tensor_sizes[0] +
                                                                      operation->input_tensor_sizes[1]);
         reference_tensor_with_offset(subgraph,
                                      operation->input_tensors[0],
                                      operation->input_tensors[1],
                                      operation->input_tensor_sizes[0],
                                      operation->input_tensor_sizes[1]);
      } else {
         etna_ml_create_tensor(subgraph, operation->input_tensors[0], operation->input_tensor_sizes[0]);
      }
   }

   /* Create any output tensors that aren't inputs to other operations;
    * these are the outputs of the graph.
    */
   ML_DBG("Ensuring all output tensors have their memory backing.\n");
   list_for_each_entry(struct etna_operation, operation, etna_operations, link) {
      struct pipe_resource *res = etna_ml_get_tensor(subgraph, operation->output_tensors[0]);
      if (res != NULL)
         continue;

      etna_ml_create_tensor(subgraph, operation->output_tensors[0], operation->output_tensor_sizes[0]);
   }

   if (DBG_ENABLED(ETNA_DBG_ML_MSGS))
      dump_graph(etna_operations);
}

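/* Tensor indices come from the client, so the number of entries to reserve
 * is the highest index referenced by any operation, plus one.
 */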
static unsigned
count_tensors(const struct pipe_ml_operation *poperations,
              unsigned count)
{
   unsigned tensor_count = 0;

   for (unsigned i = 0; i < count; i++) {
      const struct pipe_ml_operation *poperation = &poperations[i];

      for (unsigned j = 0; j < poperation->input_count; j++)
         tensor_count = MAX2(tensor_count, poperation->input_tensors[j]->index);

      for (unsigned j = 0; j < poperation->output_count; j++)
         tensor_count = MAX2(tensor_count, poperation->output_tensors[j]->index);

      switch (poperation->type) {
      case PIPE_ML_OPERATION_TYPE_CONVOLUTION:
         tensor_count = MAX2(tensor_count, poperation->conv.weight_tensor->index);
         tensor_count = MAX2(tensor_count, poperation->conv.bias_tensor->index);
         break;
      case PIPE_ML_OPERATION_TYPE_FULLY_CONNECTED:
         tensor_count = MAX2(tensor_count, poperation->fcon.weight_tensor->index);
         tensor_count = MAX2(tensor_count, poperation->fcon.bias_tensor->index);
         break;
      case PIPE_ML_OPERATION_TYPE_PAD:
      case PIPE_ML_OPERATION_TYPE_ADD:
      case PIPE_ML_OPERATION_TYPE_CONCATENATION:
      case PIPE_ML_OPERATION_TYPE_SPLIT:
         break;
      default:
         unreachable("Unsupported ML operation type");
      }
   }

   return tensor_count + 1;
}

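/* Create a subgraph: lower the operations, compile each NN/TP job into its
 * configuration BOs, and drop the intermediate representation. Concat and
 * split jobs compile to nothing; they only affect tensor placement.
 */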
struct pipe_ml_subgraph *
etna_ml_subgraph_create(struct pipe_context *pcontext,
                        const struct pipe_ml_operation *poperations,
                        unsigned count)
{
   struct etna_context *ctx = etna_context(pcontext);
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   struct etna_ml_subgraph *subgraph;
   struct list_head operations;
   unsigned tensor_count;

   if (nn_core_count < 1) {
      fprintf(stderr, "We need at least 1 NN core to do anything useful.\n");
      abort();
   }

   subgraph = calloc(1, sizeof(*subgraph));
   tensor_count = count_tensors(poperations, count);

   list_inithead(&operations);

   subgraph->base.context = pcontext;
   util_dynarray_init(&subgraph->operations, NULL);

   util_dynarray_init(&subgraph->tensors, NULL);
   if (!util_dynarray_resize(&subgraph->tensors, struct pipe_resource *, tensor_count))
      return NULL;
   memset(util_dynarray_begin(&subgraph->tensors), 0, subgraph->tensors.size);

   util_dynarray_init(&subgraph->offsets, NULL);
   if (!util_dynarray_resize(&subgraph->offsets, unsigned, tensor_count))
      return NULL;
   memset(util_dynarray_begin(&subgraph->offsets), 0, subgraph->offsets.size);

   util_dynarray_init(&subgraph->sizes, NULL);
   if (!util_dynarray_resize(&subgraph->sizes, unsigned, tensor_count))
      return NULL;
   memset(util_dynarray_begin(&subgraph->sizes), 0, subgraph->sizes.size);

   lower_operations(subgraph, poperations, count, &operations);

   list_for_each_entry(struct etna_operation, operation, &operations, link) {
      struct etna_vip_instruction instruction = {0};

      switch (operation->type) {
      case ETNA_JOB_TYPE_NN:
         etna_ml_compile_operation_nn(subgraph, operation, &instruction);
         break;
      case ETNA_JOB_TYPE_TP:
         etna_ml_compile_operation_tp(subgraph, operation, &instruction);
         break;
      case ETNA_JOB_TYPE_CONCAT:
      case ETNA_JOB_TYPE_SPLIT:
         continue;
      }

      util_dynarray_append(&subgraph->operations, struct etna_vip_instruction, instruction);
   }

   list_for_each_entry_safe(struct etna_operation, operation, &operations, link) {
      pipe_resource_reference(&operation->weight_tensor, NULL);
      pipe_resource_reference(&operation->bias_tensor, NULL);
      free(operation);
   }

   return &subgraph->base;
}

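/* Debug helpers: write a buffer (or a whole BO) to a file named after the
 * operation and suboperation, e.g. for comparing against the blob driver.
 */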
static void
dump_buffer(const uint8_t *ptr, const char *name, int operation_nr, int suboperation_nr, int offset, unsigned size)
{
   char buffer[255];

   snprintf(buffer, sizeof(buffer), "mesa-%s-%03d-%03d.bin", name, operation_nr, suboperation_nr);

   ML_DBG("Dumping buffer from %p at offset %d with size %u to %s\n", ptr, offset, size, buffer);

   FILE *f = fopen(buffer, "wb");
   assert(f);
   fwrite(ptr + offset, 1, size, f);
   if (ferror(f)) {
      ML_DBG("Error in writing to file: %s\n", strerror(errno));
   }
   fflush(f);
   fclose(f);
}

static void
dump_bo(struct etna_bo *bo, const char *name, int operation_nr, int suboperation_nr, int offset, int size)
{
   const uint8_t *map = etna_bo_map(bo);
   if (size == 0)
      size = etna_bo_size(bo) - offset;
   dump_buffer(map, name, operation_nr, suboperation_nr, offset, size);
}

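/* One-time hardware setup before the first job is submitted. The stream
 * contents mirror what the blob driver emits, padding zeroes included, to
 * keep command stream diffs minimal.
 */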
static void
init_npu(struct pipe_context *pctx)
{
   struct etna_context *ctx = etna_context(pctx);
   struct etna_cmd_stream *stream = ctx->stream;

   /* These zeroes match the blob's cmdstream. They are here to make diff'ing easier. */
   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);

   etna_set_state(stream, VIVS_PA_SYSTEM_MODE, VIVS_PA_SYSTEM_MODE_PROVOKING_VERTEX_LAST |
                                               VIVS_PA_SYSTEM_MODE_HALF_PIXEL_CENTER);
   etna_set_state(stream, VIVS_GL_API_MODE, VIVS_GL_API_MODE_OPENCL);

   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);

   pctx->flush(pctx, NULL, 0);
}

static void
close_batch(struct pipe_context *pctx)
{
   struct etna_context *ctx = etna_context(pctx);
   struct etna_cmd_stream *stream = ctx->stream;

   unsigned cache = VIVS_GL_FLUSH_CACHE_DEPTH | VIVS_GL_FLUSH_CACHE_COLOR | VIVS_GL_FLUSH_CACHE_UNK10;
   if (!DBG_ENABLED(ETNA_DBG_NPU_PARALLEL))
      cache |= VIVS_GL_FLUSH_CACHE_UNK11 | VIVS_GL_FLUSH_CACHE_SHADER_L1;

   etna_set_state(stream, VIVS_GL_FLUSH_CACHE, cache);
   etna_set_state(stream, VIVS_GL_FLUSH_CACHE, cache);

   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);

   ctx->dirty = 0;
}

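/* Upload the input tensors and emit one job per compiled instruction.
 * Signed inputs are biased by 128 on upload, as the hardware appears to
 * work on unsigned 8-bit values. With ETNA_DBG_NPU_NO_BATCHING set, each
 * job is submitted and flushed on its own, which makes it possible to dump
 * the intermediate tensors between jobs.
 */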
void
etna_ml_subgraph_invoke(struct pipe_context *pctx, struct pipe_ml_subgraph *psubgraph,
                        unsigned inputs_count, unsigned input_idxs[], void *inputs[],
                        bool is_signed[])
{
   struct etna_context *ctx = etna_context(pctx);
   unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count;
   struct etna_ml_subgraph *subgraph = (struct etna_ml_subgraph *)(psubgraph);
   unsigned *offsets = util_dynarray_begin(&subgraph->offsets);
   unsigned *sizes = util_dynarray_begin(&subgraph->sizes);
   struct etna_cmd_stream *stream = ctx->stream;
   static bool is_initialized = false;

   if (!is_initialized) {
      init_npu(pctx);
      is_initialized = true;
   }

   if (!DBG_ENABLED(ETNA_DBG_NPU_NO_BATCHING)) {
      /* These zeroes match the blob's cmdstream. They are here to make diff'ing easier. */
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
   }

   for (unsigned i = 0; i < inputs_count; i++) {
      struct pipe_resource *res = etna_ml_get_tensor(subgraph, input_idxs[i]);
      if (is_signed[i]) {
         struct pipe_transfer *dst_transfer;
         const uint8_t *src = inputs[i];
         uint8_t *dst_map;
         dst_map = pipe_buffer_map_range(pctx, res, 0, sizes[input_idxs[i]], PIPE_MAP_WRITE, &dst_transfer);
         assert(dst_map);
         for (unsigned k = 0; k < sizes[input_idxs[i]]; k++) {
            dst_map[k] = src[k] + 128;
         }
         pipe_buffer_unmap(pctx, dst_transfer);
      } else {
         pipe_buffer_write(pctx, res, offsets[input_idxs[i]], sizes[input_idxs[i]], inputs[i]);
      }
   }

   unsigned i = 0;
   util_dynarray_foreach(&subgraph->operations, struct etna_vip_instruction, operation) {

      if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS)) {
         switch (operation->type) {
         case ETNA_JOB_TYPE_TP:
            for (unsigned j = 0; j < tp_core_count && operation->configs[j]; j++) {
               dump_bo(operation->configs[j], "tp", i, j, 0, 0);
            }
            break;
         case ETNA_JOB_TYPE_NN:
            dump_bo(operation->configs[0], "nn", i, 0, 0, 0);
            dump_bo(operation->coefficients, "compressed", i, 0, 0, 0);
            break;
         default:
            unreachable("Unsupported ML operation type");
         }
      }

      if (DBG_ENABLED(ETNA_DBG_NPU_NO_BATCHING)) {
         /* These zeroes match the blob's cmdstream. They are here to make diff'ing easier. */
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
      }

      for (unsigned j = 0; j < tp_core_count && operation->configs[j]; j++)
         etna_cmd_stream_ref_bo(stream, operation->configs[j], ETNA_RELOC_READ);
      if (operation->coefficients)
         etna_cmd_stream_ref_bo(stream, operation->coefficients, ETNA_RELOC_READ);
      etna_cmd_stream_ref_bo(stream, etna_resource(operation->input)->bo, ETNA_RELOC_READ);
      etna_cmd_stream_ref_bo(stream, etna_resource(operation->output)->bo, ETNA_RELOC_WRITE);

      switch (operation->type) {
      case ETNA_JOB_TYPE_TP:
         etna_ml_emit_operation_tp(subgraph, operation, i);
         break;
      case ETNA_JOB_TYPE_NN:
         etna_ml_emit_operation_nn(subgraph, operation, i);
         break;
      default:
         unreachable("Unsupported ML operation type");
      }

      if (DBG_ENABLED(ETNA_DBG_NPU_NO_BATCHING)) {
         ML_DBG("Running operation %d - %d\n", i, operation->type);
         close_batch(pctx);

         if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS))
            dump_buffer((uint8_t *)ctx->stream->buffer, "cmd", i, 0, 0, ctx->stream->offset * 4);

         pctx->flush(pctx, NULL, 0);

         if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS)) {
            struct pipe_transfer *transfer = NULL;

            pipe_buffer_map(pctx, operation->input, PIPE_MAP_READ, &transfer);
            dump_bo(etna_resource(operation->input)->bo, "input", i, 0, operation->input_offset, 0);
            pipe_buffer_unmap(pctx, transfer);

            pipe_buffer_map(pctx, operation->output, PIPE_MAP_READ, &transfer);
            dump_bo(etna_resource(operation->output)->bo, "output", i, 0, operation->output_offset, 0);
            pipe_buffer_unmap(pctx, transfer);
         }

         stream = ctx->stream;
      }

      i++;
   }

   if (!DBG_ENABLED(ETNA_DBG_NPU_NO_BATCHING))
      close_batch(pctx);

   if (DBG_ENABLED(ETNA_DBG_FLUSH_ALL))
      pctx->flush(pctx, NULL, 0);
}

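/* Flush the batch and copy the output tensors back to the client's buffers,
 * undoing the +128 bias for signed outputs. With ETNA_DBG_ML_MSGS enabled,
 * the wall-clock time of the whole job is measured by mapping the last
 * output, which waits for the hardware to finish.
 */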
void
etna_ml_subgraph_read_outputs(struct pipe_context *context, struct pipe_ml_subgraph *psubgraph,
                              unsigned outputs_count, unsigned output_idxs[], void *outputs[],
                              bool is_signed[])
{
   struct etna_ml_subgraph *subgraph = (struct etna_ml_subgraph *)(psubgraph);
   unsigned operation_count = util_dynarray_num_elements(&subgraph->operations, struct etna_vip_instruction);
   struct etna_vip_instruction *last_operation;

   last_operation = util_dynarray_element(&subgraph->operations,
                                          struct etna_vip_instruction,
                                          operation_count - 1);

   if (DBG_ENABLED(ETNA_DBG_ML_MSGS)) {
      long start, end;
      struct timespec time;

      clock_gettime(CLOCK_MONOTONIC, &time);
      start = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;

      context->flush(context, NULL, 0);

      struct pipe_transfer *transfer = NULL;
      pipe_buffer_map(context, last_operation->output, PIPE_MAP_READ, &transfer);
      pipe_buffer_unmap(context, transfer);

      clock_gettime(CLOCK_MONOTONIC, &time);
      end = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;
      ML_DBG("Running the NN job took %ld ms.\n", (end - start));
   } else
      context->flush(context, NULL, 0);

   for (unsigned i = 0; i < outputs_count; i++) {
      struct pipe_resource *res = etna_ml_get_tensor(subgraph, output_idxs[i]);
      if (is_signed[i]) {
         struct pipe_transfer *src_transfer;
         uint8_t *src_map;
         src_map = (uint8_t *) pipe_buffer_map_range(context,
                                                     res,
                                                     0, pipe_buffer_size(res),
                                                     PIPE_MAP_READ,
                                                     &src_transfer);
         assert(src_map);
         for (unsigned k = 0; k < etna_ml_get_size(subgraph, output_idxs[i]); k++) {
            ((uint8_t *)(outputs[i]))[k] = src_map[k] - 128;
         }
         pipe_buffer_unmap(context, src_transfer);
      } else {
         pipe_buffer_read(context, res, 0, etna_ml_get_size(subgraph, output_idxs[i]), outputs[i]);
      }
   }
}

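/* Release everything the subgraph owns: per-job config and coefficient BOs,
 * the input/output resource references taken at compile time, and the
 * tensor bookkeeping arrays.
 */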
void
etna_ml_subgraph_destroy(struct pipe_context *context, struct pipe_ml_subgraph *psubgraph)
{
   struct etna_ml_subgraph *subgraph = (struct etna_ml_subgraph *)(psubgraph);

   util_dynarray_foreach(&subgraph->operations, struct etna_vip_instruction, operation) {
      for (unsigned j = 0; j < MAX_CONFIG_BOS && operation->configs[j]; j++)
         etna_bo_del(operation->configs[j]);
      etna_bo_del(operation->coefficients);
      pipe_resource_reference(&operation->input, NULL);
      pipe_resource_reference(&operation->output, NULL);
   }
   util_dynarray_fini(&subgraph->operations);

   util_dynarray_foreach(&subgraph->tensors, struct pipe_resource *, tensor) {
      pipe_resource_reference(tensor, NULL);
   }
   util_dynarray_fini(&subgraph->tensors);
   util_dynarray_fini(&subgraph->offsets);
   util_dynarray_fini(&subgraph->sizes);

   free(subgraph);
}