/*
 * Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
 * SPDX-License-Identifier: MIT
 */

#include "hw/state.xml.h"
#include "util/u_inlines.h"

#include "etnaviv_context.h"
#include "etnaviv_debug.h"
#include "etnaviv_emit.h"
#include "etnaviv_ml.h"
#include "etnaviv_ml_tp.h"

#define FIELD(field, bits) uint32_t field : bits;

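/* Parameter block for a TP (tensor processing) job as the hardware consumes
 * it; each numbered comment below marks one 32-bit word. Field names follow
 * what has been observed in the blob driver.
 */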
struct etna_tp_params {
   /* 0 */
   FIELD(in_image_x_size, 16)
   FIELD(unused0, 16)

   /* 1 */
   FIELD(in_image_y_size, 16)
   FIELD(in_image_z_size, 16)

   /* 2 */
   FIELD(in_image_stride, 16)
   FIELD(unused1, 16)

   /* 3 */
   FIELD(in_image_slice, 32)

   /* 4 */
   FIELD(in_window_x_start, 16)
   FIELD(in_window_y_start, 16)

   /* 5 */
   FIELD(in_window_x_end, 16)
   FIELD(in_window_y_end, 16)

   /* 6 */
   FIELD(in_tile_sequence, 2)
   FIELD(in_tile_global_mem, 1)
   FIELD(in_image_global_mem, 1)
   FIELD(alu_i2f_enable, 1)
   FIELD(alu_square_enable, 1)
   FIELD(alu_horz_processing, 3) /* Watch out, it is split in two in the blob */
   FIELD(alu_horz_proc_count, 6)
   FIELD(alu_horz_proc_stride, 1)
   FIELD(alu_vert_processing, 2)
   FIELD(unused2, 1)
   FIELD(alu_vert_proc_count, 6)
   FIELD(alu_vert_proc_stride, 1)
   FIELD(alu_nms_enable, 1)
   FIELD(alu_pwl_enable, 1)
   FIELD(alu_mult_enable, 1)
   FIELD(alu_f2i_enable, 1)
   FIELD(alu_load_pwl_lut, 1)
   FIELD(alu_load_pwl_lut_global_mem, 1)

   /* 7 */
   FIELD(in_tile_list_address, 32)

   /* 8 */
   FIELD(in_tile_x_size, 16)
   FIELD(in_tile_y_size, 16)

   /* 9 */
   FIELD(in_tile_x_inc, 16)
   FIELD(in_tile_y_inc, 16)

   /* 10 */
   FIELD(in_image_base_address, 32)

   /* 11 */
   FIELD(alu_load_pwl_lut_address, 32)

   /* 12 */
   FIELD(out_tile_skip_at_border, 1)
   FIELD(out_image_global_mem, 1)
   FIELD(out_loop_1_reset, 1)
   FIELD(out_loop_2_reset, 1)
   FIELD(out_loop_3_reset, 1)
   FIELD(out_brick_mode, 1)
   FIELD(alu_z_filter_mode, 1)
   FIELD(unused3, 1)
   FIELD(in_window_z_start_overfetch, 2)
   FIELD(unused4, 1)
   FIELD(in_window_z_end_overfetch, 2)
   FIELD(unused5, 1)
   FIELD(alu_square_preshift, 4)
   FIELD(in_image_data_type, 3)
   FIELD(out_image_data_type, 3)
   FIELD(unused6, 4)
   FIELD(alu_pwl_sign_support, 1)
   FIELD(alu_relu_enable, 1)
   FIELD(no_flush, 1)
   FIELD(last, 1)

   /* 13 */
   FIELD(out_image_base_address, 32)

   /* 14 */
   FIELD(out_loop_0_inc, 32)

   /* 15 */
   FIELD(out_loop_1_inc, 32)

   /* 16 */
   FIELD(out_loop_0_count, 16)
   FIELD(out_loop_1_count, 16)

   /* 17 */
   FIELD(out_loop_2_inc, 32)

   /* 18 */
   FIELD(out_loop_3_inc, 32)

   /* 19 */
   FIELD(out_loop_2_count, 16)
   FIELD(out_loop_3_count, 16)

   /* 20 */
   FIELD(out_loop_4_inc, 32)

   /* 21 */
   FIELD(out_loop_5_inc, 32)

   /* 22 */
   FIELD(out_loop_4_count, 16)
   FIELD(out_loop_5_count, 16)

   /* 23 */
   FIELD(out_loop_6_inc, 32)

   /* 24 */
   FIELD(alu_filter_pwl_swap, 1)
   FIELD(flat_rounding_mode, 2)
   FIELD(integer_rounding_mode, 2)
   FIELD(alu_input_preshift, 5)
   FIELD(alu_output_postshift, 5)
   FIELD(alu_reorder_bits_used, 4)
   FIELD(alu_reorder_loop_2_mode, 1)
   FIELD(unused7, 4)
   FIELD(in_image_border_mode, 2)
   FIELD(alu_output_postshift_5_6, 2)
   FIELD(unused8, 4)

   /* 25 */
   FIELD(in_image_circular_buf_size, 32)  /* >> 6 */

   /* 26 */
   FIELD(in_image_circular_buf_end_address_plus_1, 32)  /* >> 6 */

   /* 27 */
   FIELD(out_image_circular_buf_size, 32)  /* >> 6 */

   /* 28 */
   FIELD(out_image_circular_buf_end_address_plus_1, 32)  /* >> 6 */

   /* 29 */
   FIELD(in_image_border_const, 16)
   FIELD(coef_zp, 8)
   FIELD(in_zp, 8)

   /* 30 */
   FIELD(out_zp, 8)
   FIELD(alu_output_post_multiplier, 15)
   FIELD(unused9, 9)
};

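/* Fill a parameter block with the defaults shared by all TP job types.
 * Callers override just the fields their operation needs.
 */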
static void
set_default_tp_config(struct etna_tp_params *map)
{
   map->unused0 = 0x0;
   map->unused1 = 0x0;
   map->in_window_x_start = 0x0;
   map->in_window_y_start = 0x0;
   map->in_tile_sequence = 0x0;
   map->in_tile_global_mem = 0x0;
   map->in_image_global_mem = 0x1;
   map->alu_i2f_enable = 0x1;
   map->alu_square_enable = 0x0;
   map->alu_horz_processing = 0x0;
   map->alu_horz_proc_count = 0x0;
   map->alu_horz_proc_stride = 0x0;
   map->alu_vert_processing = 0x0;
   map->unused2 = 0x0;
   map->alu_vert_proc_count = 0x0;
   map->alu_vert_proc_stride = 0x0;
   map->alu_nms_enable = 0x0;
   map->alu_pwl_enable = 0x0;
   map->alu_mult_enable = 0x0;
   map->alu_f2i_enable = 0x1;
   map->alu_load_pwl_lut = 0x0;
   map->alu_load_pwl_lut_global_mem = 0x0;
   map->in_tile_list_address = 0x0;
   map->in_tile_x_size = 0x1;
   map->in_tile_x_inc = 0x1;
   map->alu_load_pwl_lut_address = 0x0;
   map->out_tile_skip_at_border = 0x0;
   map->out_image_global_mem = 0x1;
   map->out_loop_1_reset = 0x0;
   map->out_loop_2_reset = 0x0;
   map->out_loop_3_reset = 0x0;
   map->out_brick_mode = 0x0;
   map->alu_z_filter_mode = 0x0;
   map->unused3 = 0x0;
   map->in_window_z_start_overfetch = 0x0;
   map->unused4 = 0x0;
   map->in_window_z_end_overfetch = 0x0;
   map->unused5 = 0x0;
   map->alu_square_preshift = 0x0;
   map->in_image_data_type = 0x0;
   map->out_image_data_type = 0x0;
   map->unused6 = 0x0;
   map->alu_pwl_sign_support = 0x0;
   map->alu_relu_enable = 0x0;
   map->no_flush = 0x0;
   map->last = 0x1;
   map->out_loop_0_inc = 0x1;
   map->out_loop_3_inc = 0x0;
   map->out_loop_3_count = 0x1;
   map->out_loop_4_inc = 0x0;
   map->out_loop_5_inc = 0x0;
   map->out_loop_4_count = 0x1;
   map->out_loop_5_count = 0x1;
   map->out_loop_6_inc = 0x0;
   map->alu_filter_pwl_swap = 0x0;
   map->flat_rounding_mode = 0x1;
   map->integer_rounding_mode = 0x1;
   map->alu_input_preshift = 0x0;
   map->alu_output_postshift = 0x0;
   map->alu_reorder_bits_used = 0x0;
   map->alu_reorder_loop_2_mode = 0x0;
   map->unused7 = 0x0;
   map->in_image_border_mode = 0x0;
   map->alu_output_postshift_5_6 = 0x0;
   map->unused8 = 0x0;
   map->in_image_border_const = 0x0;
   map->coef_zp = 0x0;
   map->alu_output_post_multiplier = 0x0;
   map->unused9 = 0x0;
}

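/* Create the configuration buffer for a TRANSPOSE job, which appears to
 * reorder the incoming NHWC tensor so that each channel ends up as a
 * contiguous plane.
 */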
static struct etna_bo *
create_transpose_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
{
   struct etna_bo *bo = etna_ml_create_bo(subgraph->base.context, sizeof(struct etna_tp_params));

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_tp_params *map = etna_bo_map(bo);

   set_default_tp_config(map);

   map->in_image_x_size = operation->input_channels;
   map->in_image_y_size = operation->input_height;
   map->in_image_z_size = operation->input_width;
   map->in_image_stride = operation->input_channels;
   map->in_image_slice = operation->input_height * operation->input_channels;
   map->in_window_x_end = operation->input_channels - 1;
   map->in_window_y_end = operation->input_height - 1;
   map->in_tile_y_size = operation->input_height;
   map->in_tile_y_inc = operation->input_height;

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensors[0]);
   unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensors[0]);
   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset;

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensors[0]);
   offset = etna_ml_get_offset(subgraph, operation->output_tensors[0]);
   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;

   map->out_loop_1_inc = operation->input_width * operation->input_height;
   map->out_loop_0_count = operation->input_height;
   map->out_loop_1_count = operation->input_channels;
   map->out_loop_2_inc = operation->input_height;
   map->out_loop_2_count = operation->input_width;
   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->in_zp = operation->input_zero_point;
   map->out_zp = operation->input_zero_point;
   map->no_flush = 0x0;

   etna_bo_cpu_fini(bo);

   return bo;
}

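/* Create the configuration buffer for a DETRANSPOSE job, which appears to
 * convert channel-major data produced by a convolution back into the layout
 * of the original output tensor.
 */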
static struct etna_bo *
create_detranspose_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
{
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   struct etna_bo *bo = etna_ml_create_bo(subgraph->base.context, sizeof(struct etna_tp_params));

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_tp_params *map = etna_bo_map(bo);

   set_default_tp_config(map);

   map->in_image_x_size = input_width;
   map->in_image_y_size = input_height * input_channels;
   map->in_image_z_size = 0x1;
   map->in_image_stride = input_width;
   map->in_image_slice = input_width * input_height * input_channels;
   map->in_window_x_end = input_width - 1;
   map->in_window_y_end = input_height * input_channels - 1;
   map->in_tile_y_size = 0x1;
   map->in_tile_y_inc = 0x1;

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensors[0]);
   unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensors[0]);
   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset;

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensors[0]);
   offset = etna_ml_get_offset(subgraph, operation->output_tensors[0]);
   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;

   map->out_loop_0_inc = input_channels;
   map->out_loop_1_inc = 0x0;
   map->out_loop_0_count = input_height;
   map->out_loop_1_count = 0x1;
   map->out_loop_2_inc = input_height * input_channels;
   map->out_loop_2_count = input_width;
   map->out_loop_3_inc = 0x1;
   map->out_loop_3_count = input_channels;
   map->out_loop_4_inc = input_width * input_height * input_channels;
   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->in_zp = operation->input_zero_point;
   map->out_zp = operation->input_zero_point;

   etna_bo_cpu_fini(bo);

   return bo;
}

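/* Work out core tp_core's share of a reshuffle job that is split across
 * tp_cores_used TP cores. The largest output dimension gets split;
 * in_dims/out_dims are updated to this core's slice, any extra padding for
 * it is returned through pad_x_out/pad_y_out, and the index of the split
 * dimension is returned.
 */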
static unsigned
split_reshuffle(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
                unsigned tp_core, unsigned tp_cores_used, unsigned *in_dims, unsigned *out_dims,
                unsigned *pad_x_out, unsigned *pad_y_out)
{
   unsigned remaining_out_size, remaining_in_size;
   unsigned dim_to_split = 0;

   if (out_dims[1] >= out_dims[dim_to_split])
      dim_to_split = 1;

   if (out_dims[2] >= out_dims[dim_to_split])
      dim_to_split = 2;

   remaining_in_size = in_dims[dim_to_split];
   remaining_out_size = out_dims[dim_to_split];

   for (unsigned i = 0; i <= tp_core; i++) {
      unsigned size = DIV_ROUND_UP(remaining_out_size, (tp_cores_used - i));
      unsigned pad_x = 0;
      unsigned pad_y = 0;

      if (operation->padding_same) {
         if (operation->weight_width == 5) {
            if (i == 0 || dim_to_split != 0)
               pad_x++;

            if (i == 0 || dim_to_split != 1)
               pad_y++;
         }

         if (operation->input_width % 2)
            if (i == 0 || dim_to_split != 0)
               pad_x++;

         if (operation->input_height % 2)
            if (i == 0 || dim_to_split != 1)
               pad_y++;
      }

      if (i < tp_cores_used - 1) {
         in_dims[dim_to_split] = size;

         if (dim_to_split != 2)
            in_dims[dim_to_split] *= operation->stride;

         if (dim_to_split == 0)
            in_dims[dim_to_split] -= pad_x;
         else if (dim_to_split == 1)
            in_dims[dim_to_split] -= pad_y;

         remaining_in_size -= in_dims[dim_to_split];
      } else
         in_dims[dim_to_split] = remaining_in_size;

      if (i == tp_core) {
         if (pad_x_out)
            *pad_x_out = pad_x;
         if (pad_y_out)
            *pad_y_out = pad_y;
      }

      out_dims[dim_to_split] = size;

      remaining_out_size -= size;
   }

   return dim_to_split;
}

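/* Create the configuration buffer for one TP core's slice of a RESHUFFLE
 * job. Reshuffling rearranges the input of a strided convolution so that an
 * equivalent stride-1 convolution can be run afterwards.
 */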
static struct etna_bo *
create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
                        unsigned tp_core, unsigned tp_cores_used)
{
   struct etna_bo *bo = etna_ml_create_bo(subgraph->base.context, sizeof(struct etna_tp_params));
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned output_width = operation->output_width;
   unsigned output_height = operation->output_height;
   unsigned in_dims[3];
   unsigned out_dims[3];

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_tp_params *map = etna_bo_map(bo);

   set_default_tp_config(map);

   if (input_height > input_width) {
      SWAP(input_width, input_height);
      SWAP(output_width, output_height);
   }

   in_dims[0] = input_width;
   in_dims[1] = input_height;
   in_dims[2] = operation->input_channels;

   out_dims[0] = output_width;
   out_dims[1] = output_height;
   out_dims[2] = operation->input_channels;

   unsigned pad_x = 0;
   unsigned pad_y = 0;
   unsigned split_dim = split_reshuffle(subgraph, operation, tp_core, tp_cores_used, in_dims, out_dims, &pad_x, &pad_y);

   map->in_image_x_size = in_dims[0];
   map->in_image_y_size = in_dims[1];
   map->in_image_z_size = in_dims[2];

   ML_DBG("map->in_image_z_size %d in_dims[2] %d split_dim %d\n", map->in_image_z_size, in_dims[2], split_dim);

   map->in_image_stride = operation->input_height;
   map->in_image_slice = input_width * input_height;

   map->in_window_x_start = 0x0 - pad_x;
   map->in_window_y_start = 0x0 - pad_y;

   unsigned out_loop_0_count = 0x2;
   map->in_window_x_end = out_dims[0] * out_loop_0_count - 1 - pad_x;
   map->in_window_y_end = out_dims[1] * 2 - 1 - pad_y;
   map->in_tile_x_size = out_dims[0] * out_loop_0_count;
   map->in_tile_x_inc = map->in_tile_x_size;
   map->in_tile_y_size = out_dims[1] * 2;
   map->in_tile_y_inc = out_dims[1] * 2;

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensors[0]);
   unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensors[0]);
   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset;

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensors[0]);
   offset = etna_ml_get_offset(subgraph, operation->output_tensors[0]);
   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;

   for (unsigned i = 0; i < tp_core; i++) {
      unsigned in_dims[3];
      unsigned out_dims[3];
      unsigned in_offset = 0;
      unsigned out_offset = 0;

      in_dims[0] = input_width;
      in_dims[1] = input_height;
      in_dims[2] = operation->input_channels;

      out_dims[0] = output_width;
      out_dims[1] = output_height;
      out_dims[2] = operation->input_channels;

      unsigned split_dim = split_reshuffle(subgraph, operation, i, tp_cores_used, in_dims, out_dims, NULL, NULL);

      switch(split_dim) {
         case 0:
            in_offset = in_dims[0];
            out_offset = out_dims[0];
            break;
         case 1:
            in_offset = map->in_image_stride * in_dims[1];
            out_offset = output_height * out_dims[1];
            break;
         case 2:
            in_offset = map->in_image_slice * in_dims[2];
            out_offset = out_dims[2] * map->in_tile_x_size * map->in_tile_y_size;
            break;
         default:
            break;
      }

      map->in_image_base_address += in_offset;
      map->out_image_base_address += out_offset;
   }

   map->out_loop_1_reset = 0x1;
   map->out_loop_2_reset = 0x0;
   map->out_loop_3_reset = 0x1;
   map->out_loop_0_inc = output_width * output_height;
   map->out_loop_1_inc = 0x1;
   map->out_loop_0_count = out_loop_0_count;
   map->out_loop_1_count = out_dims[0];
   map->out_loop_2_count = out_loop_0_count;
   map->out_loop_3_count = out_dims[1];
   map->out_loop_2_inc = map->out_loop_0_inc * 2;
   map->out_loop_3_inc = output_width;
   map->out_loop_6_inc = map->out_loop_0_inc * 4;

   map->in_zp = operation->input_zero_point;
   map->out_zp = operation->input_zero_point;

   if (tp_cores_used > 1)
      map->no_flush = tp_core < tp_cores_used - 1;

   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;

   etna_bo_cpu_fini(bo);

   return bo;
}


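/* Work out core tp_core's share of a pad job that is split across
 * tp_cores_used TP cores. Pad jobs are always split along the channel
 * dimension (index 2).
 */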
static void
split_pad(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
          unsigned tp_core, unsigned tp_cores_used, unsigned *in_dims, unsigned *out_dims)
{
   unsigned remaining_in_size;
   unsigned dim_to_split = 2;

   remaining_in_size = in_dims[dim_to_split];

   for (unsigned i = 0; i <= tp_core; i++) {
      unsigned size = DIV_ROUND_UP(remaining_in_size, (tp_cores_used - i));

      if (i < tp_cores_used - 1) {
         in_dims[dim_to_split] = size;
         remaining_in_size -= in_dims[dim_to_split];
      } else
         in_dims[dim_to_split] = remaining_in_size;

      out_dims[dim_to_split] = size;
   }
}

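/* Create the configuration buffer for one TP core's slice of a PAD job. */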
static struct etna_bo *
create_pad_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
                  unsigned tp_core, unsigned tp_cores_used)
{
   struct pipe_context *pctx = subgraph->base.context;
   struct etna_bo *bo = etna_ml_create_bo(pctx, sizeof(struct etna_tp_params));
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   unsigned output_width = operation->output_width;
   unsigned output_height = operation->output_height;
   unsigned output_channels = operation->output_channels;
   unsigned in_dims[3];
   unsigned out_dims[3];

   SWAP(input_width, input_height);
   SWAP(output_width, output_height);

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_tp_params *map = etna_bo_map(bo);

   set_default_tp_config(map);

   in_dims[0] = input_width;
   in_dims[1] = input_height;
   in_dims[2] = input_channels;

   out_dims[0] = output_width;
   out_dims[1] = output_height;
   out_dims[2] = output_channels;

   split_pad(subgraph, operation, tp_core, tp_cores_used, in_dims, out_dims);

   map->in_image_x_size = in_dims[0];
   map->in_image_y_size = in_dims[1];
   map->in_image_z_size = in_dims[2];

   map->in_image_stride = input_width;
   map->in_image_slice = input_width * input_height;

   map->in_window_x_start = 0xffff;
   map->in_window_y_start = 0xffff;

   map->in_window_x_end = in_dims[0];
   map->in_window_y_end = in_dims[1];
   map->in_tile_x_size = out_dims[0];
   map->in_tile_x_inc = out_dims[0];
   map->in_tile_y_size = out_dims[1];
   map->in_tile_y_inc = out_dims[1];

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensors[0]);
   unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensors[0]);
   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset;

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensors[0]);
   offset = etna_ml_get_offset(subgraph, operation->output_tensors[0]);
   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;

   for (unsigned i = 0; i < tp_core; i++) {
      unsigned in_dims[3];
      unsigned out_dims[3];
      unsigned in_offset = 0;
      unsigned out_offset = 0;

      in_dims[0] = input_width;
      in_dims[1] = input_height;
      in_dims[2] = input_channels;

      out_dims[0] = output_width;
      out_dims[1] = output_height;
      out_dims[2] = output_channels;

      split_pad(subgraph, operation, i, tp_cores_used, in_dims, out_dims);

      in_offset = map->in_image_slice * in_dims[2];
      out_offset = out_dims[2];
      out_offset *= map->in_tile_x_size * map->in_tile_y_size;

      map->in_image_base_address += in_offset;
      map->out_image_base_address += out_offset;
   }

   map->out_loop_1_reset = 0x0;
   map->out_loop_2_reset = 0x0;
   map->out_loop_3_reset = 0x0;
   map->out_loop_0_inc = 0x0;
   map->out_loop_1_inc = 0x1;
   map->out_loop_0_count = 0x1;
   map->out_loop_1_count = out_dims[0];
   map->out_loop_2_count = out_dims[1];
   map->out_loop_3_count = 0x1;
   map->out_loop_2_inc = out_dims[0];
   map->out_loop_3_inc = 0x0;
   map->out_loop_6_inc = out_dims[0] * out_dims[1];

   map->in_zp = operation->input_zero_point;
   map->out_zp = operation->output_zero_point;

   if (tp_cores_used > 1)
      map->no_flush = tp_core < tp_cores_used - 1;

   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;

   etna_bo_cpu_fini(bo);

   return bo;
}

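/* Return the tensor's zero point in the unsigned 8-bit form used by the TP
 * parameter blocks: signed tensors are biased by 128.
 */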
static inline uint8_t
etna_tensor_zero_point(const struct pipe_tensor *tensor)
{
   if (tensor->is_signed) {
      assert(tensor->zero_point >= -128 && tensor->zero_point <= 127);
      return tensor->zero_point + 128;
   } else {
      assert(tensor->zero_point >= 0 && tensor->zero_point <= 255);
      return tensor->zero_point;
   }
}

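/* Lower a layout transpose of the given input tensor to a TP TRANSPOSE
 * operation, allocating a new tensor to hold the reordered data.
 */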
void
etna_ml_lower_transpose(struct etna_ml_subgraph *subgraph,
                        const struct pipe_tensor *input_tensor,
                        struct etna_operation *operation,
                        unsigned *output_tensor)
{
   operation->type = ETNA_JOB_TYPE_TP;
   operation->tp_type = ETNA_ML_TP_TRANSPOSE;

   operation->input_tensors[0] = input_tensor->index;
   operation->input_count = 1;
   operation->input_width = input_tensor->dims[1];
   operation->input_height = input_tensor->dims[2];
   operation->input_channels = input_tensor->dims[3];
   operation->input_zero_point = etna_tensor_zero_point(input_tensor);
   operation->input_scale = input_tensor->scale;
   operation->input_tensor_sizes[0] = operation->input_width *
                                      operation->input_height *
                                      operation->input_channels;

   *output_tensor = etna_ml_allocate_tensor(subgraph);
   operation->output_tensors[0] = *output_tensor;
   operation->output_width = operation->input_width;
   operation->output_height = operation->input_height;
   operation->output_channels = operation->input_channels;
   operation->output_zero_point = operation->input_zero_point;
   operation->output_scale = operation->input_scale;
   operation->output_tensor_sizes[0] = operation->output_width *
                                       operation->output_height *
                                       operation->output_channels;
}

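/* Lower the layout conversion that follows a convolution: a new tensor is
 * allocated as the DETRANSPOSE operation's input, and the result is written
 * into the convolution's original output tensor.
 */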
void
etna_ml_lower_detranspose(struct etna_ml_subgraph *subgraph,
                          struct etna_operation *convolution,
                          struct etna_operation *operation)
{
   operation->type = ETNA_JOB_TYPE_TP;
   operation->tp_type = ETNA_ML_TP_DETRANSPOSE;

   operation->input_tensors[0] = etna_ml_allocate_tensor(subgraph);
   operation->input_count = 1;
   operation->input_width = convolution->output_width;
   operation->input_height = convolution->output_height;
   operation->input_channels = convolution->output_channels;
   operation->input_zero_point = convolution->output_zero_point;
   operation->input_scale = convolution->output_scale;
   operation->input_tensor_sizes[0] = operation->input_width *
                                      operation->input_height *
                                      operation->input_channels;

   operation->output_tensors[0] = convolution->output_tensors[0];
   operation->output_count = 1;
   operation->output_width = convolution->output_width;
   operation->output_height = convolution->output_height;
   operation->output_channels = convolution->output_channels;
   operation->output_zero_point = convolution->output_zero_point;
   operation->output_scale = convolution->output_scale;
   operation->output_tensor_sizes[0] = operation->output_width *
                                       operation->output_height *
                                       operation->output_channels;
}

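/* Lower the input rearrangement ("reshuffle") used to destride a strided
 * convolution: the output shrinks by the stride in width and height while
 * its channel count grows by stride * stride.
 */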
void
etna_ml_lower_reshuffle(struct etna_ml_subgraph *subgraph,
                        const struct pipe_ml_operation *convolution,
                        struct etna_operation *operation,
                        unsigned *output_tensor)
{
   operation->type = ETNA_JOB_TYPE_TP;
   operation->tp_type = ETNA_ML_TP_RESHUFFLE;
   operation->stride = convolution->conv.stride_x;
   operation->padding_same = convolution->conv.padding_same;

   operation->input_tensors[0] = convolution->input_tensors[0]->index;
   operation->input_count = 1;
   operation->input_width = convolution->input_tensors[0]->dims[1];
   operation->input_height = convolution->input_tensors[0]->dims[2];
   operation->input_channels = convolution->input_tensors[0]->dims[3];
   operation->input_zero_point = etna_tensor_zero_point(convolution->input_tensors[0]);
   operation->input_scale = convolution->input_tensors[0]->scale;
   operation->input_tensor_sizes[0] = operation->input_width *
                                      operation->input_height *
                                      operation->input_channels;

   *output_tensor = etna_ml_allocate_tensor(subgraph);
   operation->output_tensors[0] = *output_tensor;
   operation->output_width = DIV_ROUND_UP(operation->input_width, operation->stride);
   operation->output_height = DIV_ROUND_UP(operation->input_height, operation->stride);
   operation->output_channels = operation->input_channels * operation->stride * operation->stride;
   operation->output_zero_point = etna_tensor_zero_point(convolution->input_tensors[0]);
   operation->output_scale = convolution->input_tensors[0]->scale;
   operation->output_tensor_sizes[0] = operation->output_width *
                                       operation->output_height *
                                       operation->output_channels;

   /* When destriding a convolution, the transformation to be made to the input
    * tensor will depend on the size of the weight tensor.
    */
   operation->weight_width = convolution->conv.weight_tensor->dims[1];
   operation->weight_height = convolution->conv.weight_tensor->dims[2];

   if (operation->padding_same) {
      if (operation->weight_width == 5) {
         operation->output_width += 2;
         operation->output_height += 2;
      } else {
         operation->output_width += 1;
         operation->output_height += 1;
      }
   }
}

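/* Lower an explicit pad operation to a TP PAD job. */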
void
etna_ml_lower_pad(struct etna_ml_subgraph *subgraph,
                  const struct pipe_ml_operation *pad,
                  struct etna_operation *operation)
{
   operation->type = ETNA_JOB_TYPE_TP;
   operation->tp_type = ETNA_ML_TP_PAD;
   operation->stride = 1;

   operation->input_tensors[0] = pad->input_tensors[0]->index;
   operation->input_count = 1;
   operation->input_width = pad->input_tensors[0]->dims[1];
   operation->input_height = pad->input_tensors[0]->dims[2];
   operation->input_channels = pad->input_tensors[0]->dims[3];
   operation->input_tensor_sizes[0] = operation->input_width *
                                      operation->input_height *
                                      operation->input_channels;
   operation->input_zero_point = pad->input_tensors[0]->zero_point;
   operation->input_scale = pad->input_tensors[0]->scale;

   operation->output_tensors[0] = pad->output_tensors[0]->index;
   operation->output_width = pad->output_tensors[0]->dims[1];
   operation->output_height = pad->output_tensors[0]->dims[2];
   operation->output_channels = pad->output_tensors[0]->dims[3];
   operation->output_zero_point = pad->output_tensors[0]->zero_point;
   operation->output_scale = pad->output_tensors[0]->scale;
   operation->output_tensor_sizes[0] = operation->output_width *
                                       operation->output_height *
                                       operation->output_channels;
}

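/* Build the configuration buffer(s) for a TP operation. Transpose and
 * detranspose always run on a single core; reshuffle and pad jobs are split
 * across the available TP cores where possible.
 */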
void
etna_ml_compile_operation_tp(struct etna_ml_subgraph *subgraph,
                             const struct etna_operation *operation,
                             struct etna_vip_instruction *instruction)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensors[0]);
   assert(input);
   pipe_resource_reference(&instruction->input, input);

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensors[0]);
   assert(output);
   pipe_resource_reference(&instruction->output, output);

   instruction->input_offset = etna_ml_get_offset(subgraph, operation->input_tensors[0]);
   instruction->output_offset = etna_ml_get_offset(subgraph, operation->output_tensors[0]);

   switch (operation->tp_type) {
   case ETNA_ML_TP_TRANSPOSE:
      instruction->configs[0] = create_transpose_config(subgraph, operation);
      break;
   case ETNA_ML_TP_DETRANSPOSE:
      instruction->configs[0] = create_detranspose_config(subgraph, operation);
      break;
   case ETNA_ML_TP_RESHUFFLE: {
      unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count;
      unsigned tp_cores_used;

      tp_cores_used = (operation->input_width > 8 || operation->input_channels > 1) ? tp_core_count : 1;

      /* TODO: Run among the 4 cores for faster performance */
      if ((operation->input_width == 320 || operation->input_width == 224) &&
          operation->input_channels == 3)
         tp_cores_used = 1;

      ML_DBG("reshuffle: input_width %d tp_cores_used %d\n", operation->input_width, tp_cores_used);
      for (unsigned i = 0; i < tp_cores_used; i++) {
         instruction->configs[i] = create_reshuffle_config(subgraph, operation, i, tp_cores_used);
      }
      break;
   }
   case ETNA_ML_TP_PAD: {
      unsigned tp_cores_used = etna_ml_get_core_info(ctx)->tp_core_count;

      ML_DBG("pad: input_width %d tp_cores_used %d\n", operation->input_width, tp_cores_used);
      for (unsigned i = 0; i < tp_cores_used; i++) {
         instruction->configs[i] = create_pad_config(subgraph, operation, i, tp_cores_used);
      }
      break;
   }
   }
   instruction->type = ETNA_JOB_TYPE_TP;
   instruction->tp_type = operation->tp_type;
}

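/* Emit the command stream state that kicks off a TP operation, programming
 * one parameter buffer per participating TP core.
 */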
void
etna_ml_emit_operation_tp(struct etna_ml_subgraph *subgraph,
                          struct etna_vip_instruction *operation,
                          unsigned idx)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count;
   struct etna_cmd_stream *stream = ctx->stream;
   bool more_than_one_tp_job = operation->configs[1] != NULL;
   bool parallel = DBG_ENABLED(ETNA_DBG_NPU_PARALLEL);

   for (unsigned j = 0; j < tp_core_count && operation->configs[j]; j++) {
      unsigned offset = parallel ? idx + 1 : 0;

      if (more_than_one_tp_job && (j < tp_core_count - 1))
            offset = parallel ? 0x1f : 0x1;

      etna_set_state(stream, VIVS_GL_OCB_REMAP_START, 0x0);
      etna_set_state(stream, VIVS_GL_OCB_REMAP_END, 0x0);
      etna_set_state(stream, VIVS_GL_TP_CONFIG, 0x0);

      if (operation->tp_type == ETNA_ML_TP_PAD) {
         etna_set_state(stream, VIVS_GL_UNK03950, j < tp_core_count - 1 ? 0x8 : 0x0);
      } else {
         etna_set_state(stream, VIVS_GL_UNK03950, 0x0);
      }

      etna_set_state_reloc(stream, VIVS_PS_TP_INST_ADDR, &(struct etna_reloc) {
         .bo = operation->configs[j],
         .flags = ETNA_RELOC_READ,
         .offset = offset,
      });
   }
   etna_set_state(stream, VIVS_PS_UNK10A4, parallel ? idx + 1 : 0x0);
}
924