/*
 * Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
 * SPDX-License-Identifier: MIT
 */

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>

#include "util/u_inlines.h"

#include "etnaviv_context.h"
#include "etnaviv_debug.h"
#include "etnaviv_emit.h"
#include "etnaviv_ml_nn.h"
#include "etnaviv_ml_tp.h"
#include "etnaviv_ml.h"
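/* Accessors for the per-tensor resource, offset and size arrays kept in the
 * subgraph. Tensors are addressed by the index assigned at allocation time.
 */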
struct pipe_resource *
etna_ml_get_tensor(struct etna_ml_subgraph *subgraph, unsigned idx)
{
   return *util_dynarray_element(&subgraph->tensors, struct pipe_resource *, idx);
}

unsigned
etna_ml_get_offset(struct etna_ml_subgraph *subgraph, unsigned idx)
{
   return *util_dynarray_element(&subgraph->offsets, unsigned, idx);
}

unsigned
etna_ml_get_size(struct etna_ml_subgraph *subgraph, unsigned idx)
{
   return *util_dynarray_element(&subgraph->sizes, unsigned, idx);
}
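/* Reserves a new tensor slot in the subgraph's parallel arrays and returns
 * its index. The backing resource is created later, in etna_ml_create_tensor().
 */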
unsigned
etna_ml_allocate_tensor(struct etna_ml_subgraph *subgraph)
{
   struct pipe_resource **tensors = util_dynarray_grow(&subgraph->tensors, struct pipe_resource *, 1);
   tensors[0] = NULL;

   unsigned *offsets = util_dynarray_grow(&subgraph->offsets, unsigned, 1);
   offsets[0] = 0;

   unsigned *sizes = util_dynarray_grow(&subgraph->sizes, unsigned, 1);
   sizes[0] = 0;

   return util_dynarray_num_elements(&subgraph->tensors, struct pipe_resource *) - 1;
}
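/* Lazily creates the backing resource for a tensor. If the tensor has
 * already been materialized, the requested size must match the recorded one.
 */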
static void
etna_ml_create_tensor(struct etna_ml_subgraph *subgraph, unsigned idx, unsigned size)
{
   struct pipe_context *context = subgraph->base.context;
   struct pipe_resource **tensors = util_dynarray_begin(&subgraph->tensors);
   unsigned *sizes = util_dynarray_begin(&subgraph->sizes);

   assert(idx < util_dynarray_num_elements(&subgraph->tensors, struct pipe_resource *));

   struct pipe_resource *res = tensors[idx];

   if (res != NULL) {
      assert(size == sizes[idx]);
      return;
   }

   res = etna_ml_create_resource(context, size);
   tensors[idx] = res;
   sizes[idx] = size;

   ML_DBG("created resource %p for tensor %d with size %d\n", res, idx, size);
}

static void
etna_ml_destroy_tensor(struct etna_ml_subgraph *subgraph, unsigned idx)
{
   struct pipe_resource **tensors = util_dynarray_begin(&subgraph->tensors);
   unsigned *offsets = util_dynarray_begin(&subgraph->offsets);
   unsigned *sizes = util_dynarray_begin(&subgraph->sizes);

   pipe_resource_reference(&tensors[idx], NULL);
   offsets[idx] = 0;
   sizes[idx] = 0;
}
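/* Allocates a write-combined BO on the NPU device and zero-initializes it
 * through a CPU mapping.
 */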
struct etna_bo *
etna_ml_create_bo(struct pipe_context *pctx, size_t size)
{
   struct etna_context *ctx = etna_context(pctx);
   struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
                                    size,
                                    DRM_ETNA_GEM_CACHE_WC);

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);
   void *map = etna_bo_map(bo);
   memset(map, 0, size);
   etna_bo_cpu_fini(bo);

   return bo;
}

struct pipe_resource *
etna_ml_create_resource(struct pipe_context *pctx, size_t size)
{
   struct pipe_resource *res = pipe_buffer_create(pctx->screen, 0, PIPE_USAGE_DEFAULT, size);
   void *ptr = etna_bo_map(etna_resource(res)->bo);
   memset(ptr, 0, pipe_buffer_size(res));

   return res;
}

struct etna_core_npu_info *
etna_ml_get_core_info(struct etna_context *context)
{
   struct etna_screen *screen = context->screen;
   struct etna_core_info *info = etna_gpu_get_core_info(screen->npu);
   return &info->npu;
}
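/* Strided convolutions are lowered to a reshuffle job followed by a
 * unit-stride convolution. Whether the reshuffle is needed depends on the
 * NN core version and on the convolution's geometry.
 */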
static bool
needs_reshuffle(struct etna_ml_subgraph *subgraph, const struct pipe_ml_operation *poperation)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_version = ctx->screen->specs.nn_core_version;
   bool has_stride = poperation->conv.stride_x > 1 || poperation->conv.stride_y > 1;
   bool pointwise = poperation->conv.pointwise;
   unsigned input_width = poperation->input_tensors[0]->dims[1];

   if (!has_stride)
      return false;

   if (nn_core_version < 8)
      return !(poperation->conv.depthwise && (input_width > 5 || input_width < 3)) && !pointwise;
   else {
      unsigned input_channels = poperation->input_tensors[0]->dims[3];

      if (poperation->conv.depthwise)
         return false;

      if (poperation->conv.pointwise && input_width >= 3 && input_channels > 1)
         return false;

      if (poperation->conv.pointwise && poperation->conv.padding_same)
         return false;

      return true;
   }
}
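/* Returns the operation that writes (producer) or reads (consumer) the given
 * tensor, or NULL if no operation in the list references it.
 */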
static const struct pipe_ml_operation *
etna_ml_find_producer(const struct pipe_ml_operation *poperations,
                      unsigned count,
                      unsigned tensor_idx)
{
   for (unsigned i = 0; i < count; i++) {
      const struct pipe_ml_operation *poperation = &poperations[i];

      for (unsigned j = 0; j < poperation->output_count; j++)
         if (poperation->output_tensors[j]->index == tensor_idx)
            return poperation;
   }

   return NULL;
}

static const struct pipe_ml_operation *
etna_ml_find_consumer(const struct pipe_ml_operation *poperations,
                      unsigned count,
                      unsigned tensor_idx)
{
   for (unsigned i = 0; i < count; i++) {
      const struct pipe_ml_operation *poperation = &poperations[i];

      for (unsigned j = 0; j < poperation->input_count; j++)
         if (poperation->input_tensors[j]->index == tensor_idx)
            return poperation;
   }

   return NULL;
}
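/* A transpose is inserted before an operation whose multi-channel input
 * comes straight from the application (no producer within the graph), and a
 * detranspose after an operation whose multi-channel output leaves the graph
 * (no consumer within it). Tensors internal to the graph need neither.
 */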
static bool
needs_transpose(const struct pipe_ml_operation *poperations,
                unsigned count,
                const struct pipe_ml_operation *poperation)
{
   const struct pipe_ml_operation *producer;

   if (poperation->input_tensors[0]->dims[3] == 1)
      return false;

   producer = etna_ml_find_producer(poperations, count, poperation->input_tensors[0]->index);
   if (!producer)
      return true;

   return false;
}

static bool
needs_detranspose(const struct pipe_ml_operation *poperations,
                  unsigned count,
                  const struct pipe_ml_operation *poperation)
{
   const struct pipe_ml_operation *consumer;

   if (poperation->output_tensors[0]->dims[3] == 1)
      return false;

   /* TODO: Support multiple consumers */
   consumer = etna_ml_find_consumer(poperations, count, poperation->output_tensors[0]->index);
   if (!consumer)
      return true;

   return false;
}
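/* Makes dst_tensor alias a sub-range of src_tensor's resource, starting at
 * the given offset. Used to place concat, split and add operands in a single
 * buffer so the hardware jobs can address them contiguously.
 */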
static void
reference_tensor_with_offset(struct etna_ml_subgraph *subgraph,
                             unsigned src_tensor,
                             unsigned dst_tensor,
                             unsigned offset,
                             unsigned size)
{
   struct pipe_resource **tensors = util_dynarray_begin(&subgraph->tensors);
   unsigned *offsets = util_dynarray_begin(&subgraph->offsets);
   unsigned *sizes = util_dynarray_begin(&subgraph->sizes);
   pipe_resource_reference(&tensors[dst_tensor], tensors[src_tensor]);
   offsets[dst_tensor] = offset;
   sizes[dst_tensor] = size;
}

static void
dump_graph(struct list_head *etna_operations)
{
   ML_DBG("\n");
   ML_DBG("dumping intermediate graph: %d operations\n", list_length(etna_operations));

   ML_DBG("\n");
   ML_DBG("%3s %-4s %3s %3s  %s\n", "idx", "type", "in", "out", "operation type-specific");
   ML_DBG("================================================================================================\n");
   unsigned i = 0;
   list_for_each_entry(struct etna_operation, operation, etna_operations, link) {
      switch(operation->type) {
      case ETNA_JOB_TYPE_TP:
         ML_DBG("%3d %-4s %3d %3d",
                i, "TP", operation->input_tensors[0], operation->output_tensors[0]);
         break;
      case ETNA_JOB_TYPE_NN:
         ML_DBG("%3d %-4s %3d %3d in2: %3d",
                i, "NN", operation->input_tensors[0], operation->output_tensors[0], operation->input_tensors[1]);
         break;
      case ETNA_JOB_TYPE_CONCAT:
         ML_DBG("%3d %-4s %3d %3d in2: %3d",
                i, "CONC", operation->input_tensors[0], operation->output_tensors[0], operation->input_tensors[1]);
         break;
      case ETNA_JOB_TYPE_SPLIT:
         ML_DBG("%3d %-4s %3d %3d out2: %3d",
                i, "SPLIT", operation->input_tensors[0], operation->output_tensors[0], operation->output_tensors[1]);
         break;
      }
      ML_DBG("\n");
      i++;
   }
   ML_DBG("\n");
}
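/* Lowers the gallium-level operation list into a flat list of
 * etna_operations, inserting transpose, reshuffle and detranspose helper
 * jobs where needed, then assigns memory to all the tensors involved.
 */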
static void
lower_operations(struct etna_ml_subgraph *subgraph,
                 const struct pipe_ml_operation *poperations,
                 unsigned count,
                 struct list_head *etna_operations)
{
   for (unsigned i = 0; i < count; i++) {
      const struct pipe_ml_operation *poperation = &poperations[i];

      switch(poperation->type) {
         case PIPE_ML_OPERATION_TYPE_CONVOLUTION: {
            unsigned input_tensor = poperation->input_tensors[0]->index;

            if (needs_transpose(poperations, count, poperation)) {
               ML_DBG("Adding transpose for convolution operation.\n");
               struct etna_operation *operation = calloc(1, sizeof(*operation));
               etna_ml_lower_transpose(subgraph, poperation->input_tensors[0], operation, &input_tensor);
               list_addtail(&operation->link, etna_operations);
            }

            if (needs_reshuffle(subgraph, poperation)) {
               ML_DBG("Adding reshuffle for convolution operation.\n");
               struct etna_operation *operation = calloc(1, sizeof(*operation));
               unsigned temp = 0;
               etna_ml_lower_reshuffle(subgraph, poperation, operation, &temp);
               operation->input_tensors[0] = input_tensor;
               input_tensor = temp;
               list_addtail(&operation->link, etna_operations);
            }

            ML_DBG("Adding convolution.\n");
            struct etna_operation *operation = calloc(1, sizeof(*operation));
            etna_ml_lower_convolution(subgraph, poperation, operation);
            operation->input_tensors[0] = input_tensor;
            list_addtail(&operation->link, etna_operations);

            if (needs_detranspose(poperations, count, poperation)) {
               ML_DBG("Adding detranspose for convolution operation.\n");
               struct etna_operation *detranspose = calloc(1, sizeof(*detranspose));
               etna_ml_lower_detranspose(subgraph, operation, detranspose);
               operation->output_tensors[0] = detranspose->input_tensors[0];
               list_addtail(&detranspose->link, etna_operations);
            }
            break;
         }
         case PIPE_ML_OPERATION_TYPE_ADD: {
            struct etna_operation *operation = calloc(1, sizeof(*operation));
            etna_ml_lower_add(subgraph, poperation, operation);
            list_addtail(&operation->link, etna_operations);

            if (needs_detranspose(poperations, count, poperation)) {
               struct etna_operation *detranspose = calloc(1, sizeof(*detranspose));
               etna_ml_lower_detranspose(subgraph, operation, detranspose);
               operation->output_tensors[0] = detranspose->input_tensors[0];
               list_addtail(&detranspose->link, etna_operations);
            }
            break;
         }
         case PIPE_ML_OPERATION_TYPE_CONCATENATION: {
            bool do_transpose = needs_transpose(poperations, count, poperation);

            struct etna_operation *operation = calloc(1, sizeof(*operation));
            operation->type = ETNA_JOB_TYPE_CONCAT;
            assert(poperation->input_count <= MAX_TENSORS);
            for (unsigned j = 0; j < poperation->input_count; j++) {
               unsigned input_tensor = poperation->input_tensors[j]->index;

               if (do_transpose) {
                  struct etna_operation *transpose = calloc(1, sizeof(*transpose));
                  etna_ml_lower_transpose(subgraph, poperation->input_tensors[j], transpose, &input_tensor);
                  list_addtail(&transpose->link, etna_operations);
               }

               operation->input_tensors[j] = input_tensor;
               operation->input_tensor_sizes[j] = poperation->input_tensors[j]->dims[1] *
                                                  poperation->input_tensors[j]->dims[2] *
                                                  poperation->input_tensors[j]->dims[3];
            }
            operation->input_count = poperation->input_count;

            operation->output_tensors[0] = poperation->output_tensors[0]->index;
            operation->output_width = poperation->output_tensors[0]->dims[1];
            operation->output_height = poperation->output_tensors[0]->dims[2];
            operation->output_channels = poperation->output_tensors[0]->dims[3];
            operation->output_tensor_sizes[0] = operation->output_width *
                                                operation->output_height *
                                                operation->output_channels;

            list_addtail(&operation->link, etna_operations);

            if (needs_detranspose(poperations, count, poperation)) {
               struct etna_operation *detranspose = calloc(1, sizeof(*detranspose));
               etna_ml_lower_detranspose(subgraph, operation, detranspose);
               operation->output_tensors[0] = detranspose->input_tensors[0];
               list_addtail(&detranspose->link, etna_operations);
            }

            break;
         }
         case PIPE_ML_OPERATION_TYPE_SPLIT: {
            struct etna_operation *operation = calloc(1, sizeof(*operation));
            operation->type = ETNA_JOB_TYPE_SPLIT;

            operation->input_tensors[0] = poperation->input_tensors[1]->index;
            operation->input_tensor_sizes[0] = poperation->input_tensors[1]->dims[1] *
                                               poperation->input_tensors[1]->dims[2] *
                                               poperation->input_tensors[1]->dims[3];

            assert(poperation->output_count <= MAX_TENSORS);
            for (unsigned j = 0; j < poperation->output_count; j++) {
               operation->output_tensors[j] = poperation->output_tensors[j]->index;
               operation->output_tensor_sizes[j] = poperation->output_tensors[j]->dims[1] *
                                                   poperation->output_tensors[j]->dims[2] *
                                                   poperation->output_tensors[j]->dims[3];
            }
            operation->output_count = poperation->output_count;

            list_addtail(&operation->link, etna_operations);

            break;
         }
         case PIPE_ML_OPERATION_TYPE_PAD: {
            unsigned input_tensor = poperation->input_tensors[0]->index;

            if (needs_transpose(poperations, count, poperation)) {
               struct etna_operation *operation = calloc(1, sizeof(*operation));
               etna_ml_lower_transpose(subgraph, poperation->input_tensors[0], operation, &input_tensor);
               list_addtail(&operation->link, etna_operations);
            }

            ML_DBG("Adding pad operation.\n");
            struct etna_operation *operation = calloc(1, sizeof(*operation));
            etna_ml_lower_pad(subgraph, poperation, operation);
            operation->input_tensors[0] = input_tensor;
            list_addtail(&operation->link, etna_operations);

            if (needs_detranspose(poperations, count, poperation)) {
               struct etna_operation *detranspose = calloc(1, sizeof(*detranspose));
               etna_ml_lower_detranspose(subgraph, operation, detranspose);
               operation->output_tensors[0] = detranspose->input_tensors[0];
               list_addtail(&detranspose->link, etna_operations);
            }

            break;
         }
         case PIPE_ML_OPERATION_TYPE_FULLY_CONNECTED: {
            struct etna_operation *operation = calloc(1, sizeof(*operation));
            etna_ml_lower_fully_connected(subgraph, poperation, operation);
            list_addtail(&operation->link, etna_operations);
            break;
         }
         default:
            unreachable("Unsupported ML operation type");
      }
   }
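   /* Memory planning: concat inputs alias consecutive sub-ranges of the
    * concat output, split outputs alias sub-ranges of the split input, and
    * the two inputs of an add are packed into one contiguous buffer.
    */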
   list_for_each_entry(struct etna_operation, operation, etna_operations, link) {
      if (operation->type == ETNA_JOB_TYPE_CONCAT) {
         etna_ml_create_tensor(subgraph, operation->output_tensors[0], operation->output_tensor_sizes[0]);

         unsigned offset = 0;
         for (unsigned i = 0; i < operation->input_count; i++) {
            reference_tensor_with_offset(subgraph,
                                         operation->output_tensors[0],
                                         operation->input_tensors[i],
                                         offset,
                                         operation->input_tensor_sizes[i]);
            offset += operation->input_tensor_sizes[i];
         }
      } else if (operation->type == ETNA_JOB_TYPE_SPLIT) {
         etna_ml_create_tensor(subgraph, operation->input_tensors[0], operation->input_tensor_sizes[0]);

         unsigned offset = 0;
         for (unsigned i = 0; i < operation->output_count; i++) {
            reference_tensor_with_offset(subgraph,
                                         operation->input_tensors[0],
                                         operation->output_tensors[i],
                                         offset,
                                         operation->output_tensor_sizes[i]);
            offset += operation->output_tensor_sizes[i];
         }
      } else if (operation->type == ETNA_JOB_TYPE_NN && operation->input_count > 1) { /* Add */
         etna_ml_destroy_tensor(subgraph, operation->input_tensors[0]);
         etna_ml_create_tensor(subgraph, operation->input_tensors[0], operation->input_tensor_sizes[0] +
                                                                      operation->input_tensor_sizes[1]);
         reference_tensor_with_offset(subgraph,
                                      operation->input_tensors[0],
                                      operation->input_tensors[1],
                                      operation->input_tensor_sizes[0],
                                      operation->input_tensor_sizes[1]);
      } else {
         etna_ml_create_tensor(subgraph, operation->input_tensors[0], operation->input_tensor_sizes[0]);
      }
   }

   /* Create any output tensors that aren't inputs to other operations; these
    * are the outputs of the graph.
    */
   ML_DBG("Ensuring all output tensors have their memory backing.\n");
   list_for_each_entry(struct etna_operation, operation, etna_operations, link) {
      struct pipe_resource *res = etna_ml_get_tensor(subgraph, operation->output_tensors[0]);
      if (res != NULL)
         continue;

      etna_ml_create_tensor(subgraph, operation->output_tensors[0], operation->output_tensor_sizes[0]);
   }

   if (DBG_ENABLED(ETNA_DBG_ML_MSGS))
      dump_graph(etna_operations);
}
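/* Returns the number of tensor slots the subgraph needs: the highest tensor
 * index referenced by any operation, plus one.
 */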
static unsigned
count_tensors(const struct pipe_ml_operation *poperations,
              unsigned count)
{
   unsigned tensor_count = 0;

   for (unsigned i = 0; i < count; i++) {
      const struct pipe_ml_operation *poperation = &poperations[i];

      for (unsigned j = 0; j < poperation->input_count; j++)
         tensor_count = MAX2(tensor_count, poperation->input_tensors[j]->index);

      for (unsigned j = 0; j < poperation->output_count; j++)
         tensor_count = MAX2(tensor_count, poperation->output_tensors[j]->index);

      switch (poperation->type) {
      case PIPE_ML_OPERATION_TYPE_CONVOLUTION:
         tensor_count = MAX2(tensor_count, poperation->conv.weight_tensor->index);
         tensor_count = MAX2(tensor_count, poperation->conv.bias_tensor->index);
         break;
      case PIPE_ML_OPERATION_TYPE_FULLY_CONNECTED:
         tensor_count = MAX2(tensor_count, poperation->fcon.weight_tensor->index);
         tensor_count = MAX2(tensor_count, poperation->fcon.bias_tensor->index);
         break;
      case PIPE_ML_OPERATION_TYPE_PAD:
      case PIPE_ML_OPERATION_TYPE_ADD:
      case PIPE_ML_OPERATION_TYPE_CONCATENATION:
      case PIPE_ML_OPERATION_TYPE_SPLIT:
         break;
      default:
         unreachable("Unsupported ML operation type");
      }
   }

   return tensor_count + 1;
}
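/* Creates a subgraph: lowers the operation list, allocates the tensor
 * bookkeeping arrays and compiles each lowered operation into a
 * ready-to-emit etna_vip_instruction.
 */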
struct pipe_ml_subgraph *
etna_ml_subgraph_create(struct pipe_context *pcontext,
                        const struct pipe_ml_operation *poperations,
                        unsigned count)
{
   struct etna_context *ctx = etna_context(pcontext);
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   struct etna_ml_subgraph *subgraph;
   struct list_head operations;
   unsigned tensor_count;

   if (nn_core_count < 1) {
      fprintf(stderr, "We need at least 1 NN core to do anything useful.\n");
      abort();
   }

   subgraph = calloc(1, sizeof(*subgraph));
   tensor_count = count_tensors(poperations, count);

   list_inithead(&operations);

   subgraph->base.context = pcontext;
   util_dynarray_init(&subgraph->operations, NULL);

   util_dynarray_init(&subgraph->tensors, NULL);
   if (!util_dynarray_resize(&subgraph->tensors, struct pipe_resource *, tensor_count))
      return NULL;
   memset(util_dynarray_begin(&subgraph->tensors), 0, subgraph->tensors.size);

   util_dynarray_init(&subgraph->offsets, NULL);
   if (!util_dynarray_resize(&subgraph->offsets, unsigned, tensor_count))
      return NULL;
   memset(util_dynarray_begin(&subgraph->offsets), 0, subgraph->offsets.size);

   util_dynarray_init(&subgraph->sizes, NULL);
   if (!util_dynarray_resize(&subgraph->sizes, unsigned, tensor_count))
      return NULL;
   memset(util_dynarray_begin(&subgraph->sizes), 0, subgraph->sizes.size);

   lower_operations(subgraph, poperations, count, &operations);

   list_for_each_entry(struct etna_operation, operation, &operations, link) {
      struct etna_vip_instruction instruction = {0};

      switch(operation->type) {
         case ETNA_JOB_TYPE_NN:
            etna_ml_compile_operation_nn(subgraph, operation, &instruction);
            break;
         case ETNA_JOB_TYPE_TP:
            etna_ml_compile_operation_tp(subgraph, operation, &instruction);
            break;
         case ETNA_JOB_TYPE_CONCAT:
         case ETNA_JOB_TYPE_SPLIT:
            continue;
      }

      util_dynarray_append(&subgraph->operations, struct etna_vip_instruction, instruction);
   }

   list_for_each_entry_safe(struct etna_operation, operation, &operations, link) {
      pipe_resource_reference(&operation->weight_tensor, NULL);
      pipe_resource_reference(&operation->bias_tensor, NULL);
      free(operation);
   }

   return &subgraph->base;
}
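/* Debug helpers: write a CPU-visible buffer or a whole BO to a file in the
 * current directory for offline inspection.
 */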
static void
dump_buffer(const uint8_t *ptr, const char *name, int operation_nr, int suboperation_nr, int offset, unsigned size)
{
   char buffer[255];

   snprintf(buffer, sizeof(buffer), "mesa-%s-%03d-%03d.bin", name, operation_nr, suboperation_nr);

   ML_DBG("Dumping buffer from %p at offset %d with size %u to %s\n", (void *)ptr, offset, size, buffer);

   FILE *f = fopen(buffer, "wb");
   assert(f);
   fwrite(ptr + offset, 1, size, f);
   if (ferror(f)) {
      ML_DBG("Error in writing to file: %s\n", strerror(errno));
   }
   fflush(f);
   fclose(f);
}

static void
dump_bo(struct etna_bo *bo, const char *name, int operation_nr, int suboperation_nr, int offset, int size)
{
   const uint8_t *map = etna_bo_map(bo);
   if (size == 0)
      size = etna_bo_size(bo) - offset;
   dump_buffer(map, name, operation_nr, suboperation_nr, offset, size);
}
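/* One-time NPU initialization, emitted before the first invocation. The
 * state written here matches what the blob driver programs.
 */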
static void
init_npu(struct pipe_context *pctx)
{
   struct etna_context *ctx = etna_context(pctx);
   struct etna_cmd_stream *stream = ctx->stream;

   /* These zeroes match the blob's cmdstream. They are here to make diff'ing easier. */
   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);

   etna_set_state(stream, VIVS_PA_SYSTEM_MODE, VIVS_PA_SYSTEM_MODE_PROVOKING_VERTEX_LAST |
                                               VIVS_PA_SYSTEM_MODE_HALF_PIXEL_CENTER);
   etna_set_state(stream, VIVS_GL_API_MODE, VIVS_GL_API_MODE_OPENCL);

   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);

   pctx->flush(pctx, NULL, 0);
}

static void
close_batch(struct pipe_context *pctx)
{
   struct etna_context *ctx = etna_context(pctx);
   struct etna_cmd_stream *stream = ctx->stream;

   unsigned cache = VIVS_GL_FLUSH_CACHE_DEPTH | VIVS_GL_FLUSH_CACHE_COLOR | VIVS_GL_FLUSH_CACHE_UNK10;
   if (!DBG_ENABLED(ETNA_DBG_NPU_PARALLEL))
      cache |= VIVS_GL_FLUSH_CACHE_UNK11 | VIVS_GL_FLUSH_CACHE_SHADER_L1;

   etna_set_state(stream, VIVS_GL_FLUSH_CACHE, cache);
   etna_set_state(stream, VIVS_GL_FLUSH_CACHE, cache);

   etna_cmd_stream_emit(stream, 0x0);
   etna_cmd_stream_emit(stream, 0x0);

   ctx->dirty = 0;
}
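/* Uploads the input tensors and emits every compiled instruction in the
 * subgraph. Signed int8 inputs are shifted to the unsigned representation by
 * adding 128 to each byte.
 */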
void
etna_ml_subgraph_invoke(struct pipe_context *pctx, struct pipe_ml_subgraph *psubgraph,
                        unsigned inputs_count, unsigned input_idxs[], void *inputs[],
                        bool is_signed[])
{
   struct etna_context *ctx = etna_context(pctx);
   unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count;
   struct etna_ml_subgraph *subgraph = (struct etna_ml_subgraph *)(psubgraph);
   unsigned *offsets = util_dynarray_begin(&subgraph->offsets);
   unsigned *sizes = util_dynarray_begin(&subgraph->sizes);
   struct etna_cmd_stream *stream = ctx->stream;
   static bool is_initialized = false;

   if (!is_initialized) {
      init_npu(pctx);
      is_initialized = true;
   }

   if (!DBG_ENABLED(ETNA_DBG_NPU_NO_BATCHING)) {
      /* These zeroes match the blob's cmdstream. They are here to make diff'ing easier. */
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
   }

   for (unsigned i = 0; i < inputs_count; i++) {
      struct pipe_resource *res = etna_ml_get_tensor(subgraph, input_idxs[i]);
      if (is_signed[i]) {
         struct pipe_transfer *dst_transfer;
         const uint8_t *src = inputs[i];
         uint8_t *dst_map;
         dst_map = pipe_buffer_map_range(pctx, res, 0, sizes[input_idxs[i]], PIPE_MAP_WRITE, &dst_transfer);
         assert(dst_map);
         for (unsigned k = 0; k < sizes[input_idxs[i]]; k++) {
            dst_map[k] = src[k] + 128;
         }
         pipe_buffer_unmap(pctx, dst_transfer);
      } else {
         pipe_buffer_write(pctx, res, offsets[input_idxs[i]], sizes[input_idxs[i]], inputs[i]);
      }
   }
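   /* Emit each instruction: reference its BOs in the command stream, then
    * let the TP/NN backends emit the actual job. With
    * ETNA_DBG_NPU_NO_BATCHING set, each job is submitted and flushed
    * individually to ease debugging.
    */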
   unsigned i = 0;
   util_dynarray_foreach(&subgraph->operations, struct etna_vip_instruction, operation) {

      if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS)) {
         switch (operation->type) {
            case ETNA_JOB_TYPE_TP:
               for (unsigned j = 0; j < tp_core_count && operation->configs[j]; j++) {
                  dump_bo(operation->configs[j], "tp", i, j, 0, 0);
               }
               break;
            case ETNA_JOB_TYPE_NN:
               dump_bo(operation->configs[0], "nn", i, 0, 0, 0);
               dump_bo(operation->coefficients, "compressed", i, 0, 0, 0);
               break;
            default:
               unreachable("Unsupported ML operation type");
         }
      }

      if (DBG_ENABLED(ETNA_DBG_NPU_NO_BATCHING)) {
         /* These zeroes match the blob's cmdstream. They are here to make diff'ing easier. */
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
      }

      for (unsigned j = 0; j < tp_core_count && operation->configs[j]; j++)
         etna_cmd_stream_ref_bo(stream, operation->configs[j], ETNA_RELOC_READ);
      if (operation->coefficients)
         etna_cmd_stream_ref_bo(stream, operation->coefficients, ETNA_RELOC_READ);
      etna_cmd_stream_ref_bo(stream, etna_resource(operation->input)->bo, ETNA_RELOC_READ);
      etna_cmd_stream_ref_bo(stream, etna_resource(operation->output)->bo, ETNA_RELOC_WRITE);

      switch (operation->type) {
         case ETNA_JOB_TYPE_TP:
            etna_ml_emit_operation_tp(subgraph, operation, i);
            break;
         case ETNA_JOB_TYPE_NN:
            etna_ml_emit_operation_nn(subgraph, operation, i);
            break;
         default:
            unreachable("Unsupported ML operation type");
      }

      if (DBG_ENABLED(ETNA_DBG_NPU_NO_BATCHING)) {
         ML_DBG("Running operation %d - %d\n", i, operation->type);
         close_batch(pctx);

         if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS))
            dump_buffer((uint8_t *)ctx->stream->buffer, "cmd", i, 0, 0, ctx->stream->offset * 4);

         pctx->flush(pctx, NULL, 0);

         if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS)) {
            struct pipe_transfer *transfer = NULL;

            pipe_buffer_map(pctx, operation->input, PIPE_MAP_READ, &transfer);
            dump_bo(etna_resource(operation->input)->bo, "input", i, 0, operation->input_offset, 0);
            pipe_buffer_unmap(pctx, transfer);

            pipe_buffer_map(pctx, operation->output, PIPE_MAP_READ, &transfer);
            dump_bo(etna_resource(operation->output)->bo, "output", i, 0, operation->output_offset, 0);
            pipe_buffer_unmap(pctx, transfer);
         }

         stream = ctx->stream;
      }

      i++;
   }

   if (!DBG_ENABLED(ETNA_DBG_NPU_NO_BATCHING))
      close_batch(pctx);

   if (DBG_ENABLED(ETNA_DBG_FLUSH_ALL))
      pctx->flush(pctx, NULL, 0);
}
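/* Flushes the pending command stream and copies the output tensors to the
 * user's buffers; signed outputs are converted back by subtracting 128 from
 * each byte. With ML debug messages enabled, also measures and logs the
 * job's wall-clock time by mapping the last operation's output.
 */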
void
etna_ml_subgraph_read_outputs(struct pipe_context *context, struct pipe_ml_subgraph *psubgraph,
                              unsigned outputs_count, unsigned output_idxs[], void *outputs[],
                              bool is_signed[])
{
   struct etna_ml_subgraph *subgraph = (struct etna_ml_subgraph *)(psubgraph);
   unsigned operation_count = util_dynarray_num_elements(&subgraph->operations, struct etna_vip_instruction);
   struct etna_vip_instruction *last_operation;

   last_operation = util_dynarray_element(&subgraph->operations,
                                          struct etna_vip_instruction,
                                          operation_count - 1);

   if (DBG_ENABLED(ETNA_DBG_ML_MSGS)) {
      long start, end;
      struct timespec time;

      clock_gettime(CLOCK_MONOTONIC, &time);
      start = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;

      context->flush(context, NULL, 0);

      struct pipe_transfer *transfer = NULL;
      pipe_buffer_map(context, last_operation->output, PIPE_MAP_READ, &transfer);
      pipe_buffer_unmap(context, transfer);

      clock_gettime(CLOCK_MONOTONIC, &time);
      end = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;
      ML_DBG("Running the NN job took %ld ms.\n", (end - start));
   } else
      context->flush(context, NULL, 0);

   for (unsigned i = 0; i < outputs_count; i++) {
      struct pipe_resource *res = etna_ml_get_tensor(subgraph, output_idxs[i]);
      if (is_signed[i]) {
         struct pipe_transfer *src_transfer;
         uint8_t *src_map;
         src_map = (uint8_t *) pipe_buffer_map_range(context,
                                                     res,
                                                     0, pipe_buffer_size(res),
                                                     PIPE_MAP_READ,
                                                     &src_transfer);
         assert(src_map);
         for (unsigned k = 0; k < etna_ml_get_size(subgraph, output_idxs[i]); k++) {
            ((uint8_t *)(outputs[i]))[k] = src_map[k] - 128;
         }
         pipe_buffer_unmap(context, src_transfer);
      } else {
         pipe_buffer_read(context, res, 0, etna_ml_get_size(subgraph, output_idxs[i]), outputs[i]);
      }
   }
}
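/* Releases everything owned by the subgraph: config and coefficient BOs,
 * the per-operation input/output references, and the tensor arrays.
 */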
void
etna_ml_subgraph_destroy(struct pipe_context *context, struct pipe_ml_subgraph *psubgraph)
{
   struct etna_ml_subgraph *subgraph = (struct etna_ml_subgraph *)(psubgraph);

   util_dynarray_foreach(&subgraph->operations, struct etna_vip_instruction, operation) {
      for (unsigned j = 0; j < MAX_CONFIG_BOS && operation->configs[j]; j++)
         etna_bo_del(operation->configs[j]);
      etna_bo_del(operation->coefficients);
      pipe_resource_reference(&operation->input, NULL);
      pipe_resource_reference(&operation->output, NULL);
   }
   util_dynarray_fini(&subgraph->operations);

   util_dynarray_foreach(&subgraph->tensors, struct pipe_resource *, tensor) {
      pipe_resource_reference(tensor, NULL);
   }
   util_dynarray_fini(&subgraph->tensors);
   util_dynarray_fini(&subgraph->offsets);
   util_dynarray_fini(&subgraph->sizes);

   free(subgraph);
}