/*
 * Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
 * SPDX-License-Identifier: MIT
 */

#include "hw/state.xml.h"
#include "util/u_inlines.h"

#include "etnaviv_context.h"
#include "etnaviv_debug.h"
#include "etnaviv_emit.h"
#include "etnaviv_ml.h"
#include "etnaviv_ml_tp.h"

#define FIELD(field, bits) uint32_t field : bits;

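/* Layout of the parameter buffer consumed by one TP core for one job. The
 * numbered comments below give the 32-bit word offset of each group of
 * fields; the meaning of several fields is inferred from observing the
 * vendor blob, so some names are best guesses.
 */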
struct etna_tp_params {
   /* 0 */
   FIELD(in_image_x_size, 16)
   FIELD(unused0, 16)

   /* 1 */
   FIELD(in_image_y_size, 16)
   FIELD(in_image_z_size, 16)

   /* 2 */
   FIELD(in_image_stride, 16)
   FIELD(unused1, 16)

   /* 3 */
   FIELD(in_image_slice, 32)

   /* 4 */
   FIELD(in_window_x_start, 16)
   FIELD(in_window_y_start, 16)

   /* 5 */
   FIELD(in_window_x_end, 16)
   FIELD(in_window_y_end, 16)

   /* 6 */
   FIELD(in_tile_sequence, 2)
   FIELD(in_tile_global_mem, 1)
   FIELD(in_image_global_mem, 1)
   FIELD(alu_i2f_enable, 1)
   FIELD(alu_square_enable, 1)
   FIELD(alu_horz_processing, 3) /* Watch out, it is split in two in the blob */
   FIELD(alu_horz_proc_count, 6)
   FIELD(alu_horz_proc_stride, 1)
   FIELD(alu_vert_processing, 2)
   FIELD(unused2, 1)
   FIELD(alu_vert_proc_count, 6)
   FIELD(alu_vert_proc_stride, 1)
   FIELD(alu_nms_enable, 1)
   FIELD(alu_pwl_enable, 1)
   FIELD(alu_mult_enable, 1)
   FIELD(alu_f2i_enable, 1)
   FIELD(alu_load_pwl_lut, 1)
   FIELD(alu_load_pwl_lut_global_mem, 1)

   /* 7 */
   FIELD(in_tile_list_address, 32)

   /* 8 */
   FIELD(in_tile_x_size, 16)
   FIELD(in_tile_y_size, 16)

   /* 9 */
   FIELD(in_tile_x_inc, 16)
   FIELD(in_tile_y_inc, 16)

   /* 10 */
   FIELD(in_image_base_address, 32)

   /* 11 */
   FIELD(alu_load_pwl_lut_address, 32)

   /* 12 */
   FIELD(out_tile_skip_at_border, 1)
   FIELD(out_image_global_mem, 1)
   FIELD(out_loop_1_reset, 1)
   FIELD(out_loop_2_reset, 1)
   FIELD(out_loop_3_reset, 1)
   FIELD(out_brick_mode, 1)
   FIELD(alu_z_filter_mode, 1)
   FIELD(unused3, 1)
   FIELD(in_window_z_start_overfetch, 2)
   FIELD(unused4, 1)
   FIELD(in_window_z_end_overfetch, 2)
   FIELD(unused5, 1)
   FIELD(alu_square_preshift, 4)
   FIELD(in_image_data_type, 3)
   FIELD(out_image_data_type, 3)
   FIELD(unused6, 4)
   FIELD(alu_pwl_sign_support, 1)
   FIELD(alu_relu_enable, 1)
   FIELD(no_flush, 1)
   FIELD(last, 1)

   /* 13 */
   FIELD(out_image_base_address, 32)

   /* 14 */
   FIELD(out_loop_0_inc, 32)

   /* 15 */
   FIELD(out_loop_1_inc, 32)

   /* 16 */
   FIELD(out_loop_0_count, 16)
   FIELD(out_loop_1_count, 16)

   /* 17 */
   FIELD(out_loop_2_inc, 32)

   /* 18 */
   FIELD(out_loop_3_inc, 32)

   /* 19 */
   FIELD(out_loop_2_count, 16)
   FIELD(out_loop_3_count, 16)

   /* 20 */
   FIELD(out_loop_4_inc, 32)

   /* 21 */
   FIELD(out_loop_5_inc, 32)

   /* 22 */
   FIELD(out_loop_4_count, 16)
   FIELD(out_loop_5_count, 16)

   /* 23 */
   FIELD(out_loop_6_inc, 32)

   /* 24 */
   FIELD(alu_filter_pwl_swap, 1)
   FIELD(flat_rounding_mode, 2)
   FIELD(integer_rounding_mode, 2)
   FIELD(alu_input_preshift, 5)
   FIELD(alu_output_postshift, 5)
   FIELD(alu_reorder_bits_used, 4)
   FIELD(alu_reorder_loop_2_mode, 1)
   FIELD(unused7, 4)
   FIELD(in_image_border_mode, 2)
   FIELD(alu_output_postshift_5_6, 2)
   FIELD(unused8, 4)

   /* 25 */
   FIELD(in_image_circular_buf_size, 32) /* >> 6 */

   /* 26 */
   FIELD(in_image_circular_buf_end_address_plus_1, 32) /* >> 6 */

   /* 27 */
   FIELD(out_image_circular_buf_size, 32) /* >> 6 */

   /* 28 */
   FIELD(out_image_circular_buf_end_address_plus_1, 32) /* >> 6 */

   /* 29 */
   FIELD(in_image_border_const, 16)
   FIELD(coef_zp, 8)
   FIELD(in_zp, 8)

   /* 30 */
   FIELD(out_zp, 8)
   FIELD(alu_output_post_multiplier, 15)
   FIELD(unused9, 9)
};

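/* Fill in the values that are common to every TP job type; callers then
 * override the fields that are specific to their operation.
 */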
static void
set_default_tp_config(struct etna_tp_params *map)
{
   map->unused0 = 0x0;
   map->unused1 = 0x0;
   map->in_window_x_start = 0x0;
   map->in_window_y_start = 0x0;
   map->in_tile_sequence = 0x0;
   map->in_tile_global_mem = 0x0;
   map->in_image_global_mem = 0x1;
   map->alu_i2f_enable = 0x1;
   map->alu_square_enable = 0x0;
   map->alu_horz_processing = 0x0;
   map->alu_horz_proc_count = 0x0;
   map->alu_horz_proc_stride = 0x0;
   map->alu_vert_processing = 0x0;
   map->unused2 = 0x0;
   map->alu_vert_proc_count = 0x0;
   map->alu_vert_proc_stride = 0x0;
   map->alu_nms_enable = 0x0;
   map->alu_pwl_enable = 0x0;
   map->alu_mult_enable = 0x0;
   map->alu_f2i_enable = 0x1;
   map->alu_load_pwl_lut = 0x0;
   map->alu_load_pwl_lut_global_mem = 0x0;
   map->in_tile_list_address = 0x0;
   map->in_tile_x_size = 0x1;
   map->in_tile_x_inc = 0x1;
   map->alu_load_pwl_lut_address = 0x0;
   map->out_tile_skip_at_border = 0x0;
   map->out_image_global_mem = 0x1;
   map->out_loop_1_reset = 0x0;
   map->out_loop_2_reset = 0x0;
   map->out_loop_3_reset = 0x0;
   map->out_brick_mode = 0x0;
   map->alu_z_filter_mode = 0x0;
   map->unused3 = 0x0;
   map->in_window_z_start_overfetch = 0x0;
   map->unused4 = 0x0;
   map->in_window_z_end_overfetch = 0x0;
   map->unused5 = 0x0;
   map->alu_square_preshift = 0x0;
   map->in_image_data_type = 0x0;
   map->out_image_data_type = 0x0;
   map->unused6 = 0x0;
   map->alu_pwl_sign_support = 0x0;
   map->alu_relu_enable = 0x0;
   map->no_flush = 0x0;
   map->last = 0x1;
   map->out_loop_0_inc = 0x1;
   map->out_loop_3_inc = 0x0;
   map->out_loop_3_count = 0x1;
   map->out_loop_4_inc = 0x0;
   map->out_loop_5_inc = 0x0;
   map->out_loop_4_count = 0x1;
   map->out_loop_5_count = 0x1;
   map->out_loop_6_inc = 0x0;
   map->alu_filter_pwl_swap = 0x0;
   map->flat_rounding_mode = 0x1;
   map->integer_rounding_mode = 0x1;
   map->alu_input_preshift = 0x0;
   map->alu_output_postshift = 0x0;
   map->alu_reorder_bits_used = 0x0;
   map->alu_reorder_loop_2_mode = 0x0;
   map->unused7 = 0x0;
   map->in_image_border_mode = 0x0;
   map->alu_output_postshift_5_6 = 0x0;
   map->unused8 = 0x0;
   map->in_image_border_const = 0x0;
   map->coef_zp = 0x0;
   map->alu_output_post_multiplier = 0x0;
   map->unused9 = 0x0;
}

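/* Build the job descriptor for a TRANSPOSE operation, which rewrites the
 * NHWC input tensor in channel-major order (presumably the layout the
 * convolution cores expect). Always runs on a single TP core.
 */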
static struct etna_bo *
create_transpose_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
{
   struct etna_bo *bo = etna_ml_create_bo(subgraph->base.context, sizeof(struct etna_tp_params));

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_tp_params *map = etna_bo_map(bo);

   set_default_tp_config(map);

   map->in_image_x_size = operation->input_channels;
   map->in_image_y_size = operation->input_height;
   map->in_image_z_size = operation->input_width;
   map->in_image_stride = operation->input_channels;
   map->in_image_slice = operation->input_height * operation->input_channels;
   map->in_window_x_end = operation->input_channels - 1;
   map->in_window_y_end = operation->input_height - 1;
   map->in_tile_y_size = operation->input_height;
   map->in_tile_y_inc = operation->input_height;

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensors[0]);
   unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensors[0]);
   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset;

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensors[0]);
   offset = etna_ml_get_offset(subgraph, operation->output_tensors[0]);
   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;

   map->out_loop_1_inc = operation->input_width * operation->input_height;
   map->out_loop_0_count = operation->input_height;
   map->out_loop_1_count = operation->input_channels;
   map->out_loop_2_inc = operation->input_height;
   map->out_loop_2_count = operation->input_width;
   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->in_zp = operation->input_zero_point;
   map->out_zp = operation->input_zero_point;
   map->no_flush = 0x0;

   etna_bo_cpu_fini(bo);

   return bo;
}

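/* Build the job descriptor for a DETRANSPOSE operation, which appears to
 * undo the layout change done by the transpose job once the convolution has
 * run, writing the data back in the original tensor layout.
 */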
static struct etna_bo *
create_detranspose_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
{
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   struct etna_bo *bo = etna_ml_create_bo(subgraph->base.context, sizeof(struct etna_tp_params));

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_tp_params *map = etna_bo_map(bo);

   set_default_tp_config(map);

   map->in_image_x_size = input_width;
   map->in_image_y_size = input_height * input_channels;
   map->in_image_z_size = 0x1;
   map->in_image_stride = input_width;
   map->in_image_slice = input_width * input_height * input_channels;
   map->in_window_x_end = input_width - 1;
   map->in_window_y_end = input_height * input_channels - 1;
   map->in_tile_y_size = 0x1;
   map->in_tile_y_inc = 0x1;

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensors[0]);
   unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensors[0]);
   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset;

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensors[0]);
   offset = etna_ml_get_offset(subgraph, operation->output_tensors[0]);
   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;

   map->out_loop_0_inc = input_channels;
   map->out_loop_1_inc = 0x0;
   map->out_loop_0_count = input_height;
   map->out_loop_1_count = 0x1;
   map->out_loop_2_inc = input_height * input_channels;
   map->out_loop_2_count = input_width;
   map->out_loop_3_inc = 0x1;
   map->out_loop_3_count = input_channels;
   map->out_loop_4_inc = input_width * input_height * input_channels;
   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->in_zp = operation->input_zero_point;
   map->out_zp = operation->input_zero_point;

   etna_bo_cpu_fini(bo);

   return bo;
}

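/* Work out how a reshuffle job is split across TP cores: the largest output
 * dimension is divided among the cores. Writes this core's slice sizes into
 * in_dims/out_dims, optionally returns the extra padding this core has to
 * apply, and returns the index of the dimension that was split.
 */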
static unsigned
split_reshuffle(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
                unsigned tp_core, unsigned tp_cores_used, unsigned *in_dims,
                unsigned *out_dims, unsigned *pad_x_out, unsigned *pad_y_out)
{
   unsigned remaining_out_size, remaining_in_size;
   unsigned dim_to_split = 0;

   if (out_dims[1] >= out_dims[dim_to_split])
      dim_to_split = 1;

   if (out_dims[2] >= out_dims[dim_to_split])
      dim_to_split = 2;

   remaining_in_size = in_dims[dim_to_split];
   remaining_out_size = out_dims[dim_to_split];

   for (unsigned i = 0; i <= tp_core; i++) {
      unsigned size = DIV_ROUND_UP(remaining_out_size, (tp_cores_used - i));
      unsigned pad_x = 0;
      unsigned pad_y = 0;

      if (operation->padding_same) {
         if (operation->weight_width == 5) {
            if (i == 0 || dim_to_split != 0)
               pad_x++;

            if (i == 0 || dim_to_split != 1)
               pad_y++;
         }

         if (operation->input_width % 2)
            if (i == 0 || dim_to_split != 0)
               pad_x++;

         if (operation->input_height % 2)
            if (i == 0 || dim_to_split != 1)
               pad_y++;
      }

      if (i < tp_cores_used - 1) {
         in_dims[dim_to_split] = size;

         if (dim_to_split != 2)
            in_dims[dim_to_split] *= operation->stride;

         if (dim_to_split == 0)
            in_dims[dim_to_split] -= pad_x;
         else if (dim_to_split == 1)
            in_dims[dim_to_split] -= pad_y;

         remaining_in_size -= in_dims[dim_to_split];
      } else
         in_dims[dim_to_split] = remaining_in_size;

      if (i == tp_core) {
         if (pad_x_out)
            *pad_x_out = pad_x;
         if (pad_y_out)
            *pad_y_out = pad_y;
      }

      out_dims[dim_to_split] = size;

      remaining_out_size -= size;
   }

   return dim_to_split;
}

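/* Build the job descriptor for one TP core's share of a RESHUFFLE operation
 * (the space-to-depth style transform used to destride a strided
 * convolution). The image base addresses are advanced past the slices that
 * the lower-numbered cores handle.
 */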
static struct etna_bo *
create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
                        unsigned tp_core, unsigned tp_cores_used)
{
   struct etna_bo *bo = etna_ml_create_bo(subgraph->base.context, sizeof(struct etna_tp_params));
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned output_width = operation->output_width;
   unsigned output_height = operation->output_height;
   unsigned in_dims[3];
   unsigned out_dims[3];

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_tp_params *map = etna_bo_map(bo);

   set_default_tp_config(map);

   if (input_height > input_width) {
      SWAP(input_width, input_height);
      SWAP(output_width, output_height);
   }

   in_dims[0] = input_width;
   in_dims[1] = input_height;
   in_dims[2] = operation->input_channels;

   out_dims[0] = output_width;
   out_dims[1] = output_height;
   out_dims[2] = operation->input_channels;

   unsigned pad_x = 0;
   unsigned pad_y = 0;
   unsigned split_dim = split_reshuffle(subgraph, operation, tp_core, tp_cores_used, in_dims, out_dims, &pad_x, &pad_y);

   map->in_image_x_size = in_dims[0];
   map->in_image_y_size = in_dims[1];
   map->in_image_z_size = in_dims[2];

   ML_DBG("map->in_image_z_size %d in_dims[2] %d split_dim %d\n", map->in_image_z_size, in_dims[2], split_dim);

   map->in_image_stride = operation->input_height;
   map->in_image_slice = input_width * input_height;

   map->in_window_x_start = 0x0 - pad_x;
   map->in_window_y_start = 0x0 - pad_y;

   unsigned out_loop_0_count = 0x2;
   map->in_window_x_end = out_dims[0] * out_loop_0_count - 1 - pad_x;
   map->in_window_y_end = out_dims[1] * 2 - 1 - pad_y;
   map->in_tile_x_size = out_dims[0] * out_loop_0_count;
   map->in_tile_x_inc = map->in_tile_x_size;
   map->in_tile_y_size = out_dims[1] * 2;
   map->in_tile_y_inc = out_dims[1] * 2;

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensors[0]);
   unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensors[0]);
   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset;

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensors[0]);
   offset = etna_ml_get_offset(subgraph, operation->output_tensors[0]);
   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;

   for (unsigned i = 0; i < tp_core; i++) {
      unsigned in_dims[3];
      unsigned out_dims[3];
      unsigned in_offset = 0;
      unsigned out_offset = 0;

      in_dims[0] = input_width;
      in_dims[1] = input_height;
      in_dims[2] = operation->input_channels;

      out_dims[0] = output_width;
      out_dims[1] = output_height;
      out_dims[2] = operation->input_channels;

      unsigned split_dim = split_reshuffle(subgraph, operation, i, tp_cores_used, in_dims, out_dims, NULL, NULL);

      switch(split_dim) {
      case 0:
         in_offset = in_dims[0];
         out_offset = out_dims[0];
         break;
      case 1:
         in_offset = map->in_image_stride * in_dims[1];
         out_offset = output_height * out_dims[1];
         break;
      case 2:
         in_offset = map->in_image_slice * in_dims[2];
         out_offset = out_dims[2] * map->in_tile_x_size * map->in_tile_y_size;
         break;
      default:
         break;
      }

      map->in_image_base_address += in_offset;
      map->out_image_base_address += out_offset;
   }

   map->out_loop_1_reset = 0x1;
   map->out_loop_2_reset = 0x0;
   map->out_loop_3_reset = 0x1;
   map->out_loop_0_inc = output_width * output_height;
   map->out_loop_1_inc = 0x1;
   map->out_loop_0_count = out_loop_0_count;
   map->out_loop_1_count = out_dims[0];
   map->out_loop_2_count = out_loop_0_count;
   map->out_loop_3_count = out_dims[1];
   map->out_loop_2_inc = map->out_loop_0_inc * 2;
   map->out_loop_3_inc = output_width;
   map->out_loop_6_inc = map->out_loop_0_inc * 4;

   map->in_zp = operation->input_zero_point;
   map->out_zp = operation->input_zero_point;

   if (tp_cores_used > 1)
      map->no_flush = tp_core < tp_cores_used - 1;

   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;

   etna_bo_cpu_fini(bo);

   return bo;
}

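/* Split a pad job across TP cores by dividing the channel dimension. */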
static void
split_pad(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
          unsigned tp_core, unsigned tp_cores_used, unsigned *in_dims, unsigned *out_dims)
{
   unsigned remaining_in_size;
   unsigned dim_to_split = 2;

   remaining_in_size = in_dims[dim_to_split];

   for (unsigned i = 0; i <= tp_core; i++) {
      unsigned size = DIV_ROUND_UP(remaining_in_size, (tp_cores_used - i));

      if (i < tp_cores_used - 1) {
         in_dims[dim_to_split] = size;
         remaining_in_size -= in_dims[dim_to_split];
      } else
         in_dims[dim_to_split] = remaining_in_size;

      out_dims[dim_to_split] = size;
   }
}

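/* Build the job descriptor for one TP core's share of a PAD operation. The
 * input window starts at -1 in x/y and ends one past the input, which seems
 * to make the hardware emit a one-pixel border around each image plane.
 */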
static struct etna_bo *
create_pad_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
                  unsigned tp_core, unsigned tp_cores_used)
{
   struct pipe_context *pctx = subgraph->base.context;
   struct etna_bo *bo = etna_ml_create_bo(pctx, sizeof(struct etna_tp_params));
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   unsigned output_width = operation->output_width;
   unsigned output_height = operation->output_height;
   unsigned output_channels = operation->output_channels;
   unsigned in_dims[3];
   unsigned out_dims[3];

   SWAP(input_width, input_height);
   SWAP(output_width, output_height);

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_tp_params *map = etna_bo_map(bo);

   set_default_tp_config(map);

   in_dims[0] = input_width;
   in_dims[1] = input_height;
   in_dims[2] = input_channels;

   out_dims[0] = output_width;
   out_dims[1] = output_height;
   out_dims[2] = output_channels;

   split_pad(subgraph, operation, tp_core, tp_cores_used, in_dims, out_dims);

   map->in_image_x_size = in_dims[0];
   map->in_image_y_size = in_dims[1];
   map->in_image_z_size = in_dims[2];

   map->in_image_stride = input_width;
   map->in_image_slice = input_width * input_height;

   map->in_window_x_start = 0xffff;
   map->in_window_y_start = 0xffff;

   map->in_window_x_end = in_dims[0];
   map->in_window_y_end = in_dims[1];
   map->in_tile_x_size = out_dims[0];
   map->in_tile_x_inc = out_dims[0];
   map->in_tile_y_size = out_dims[1];
   map->in_tile_y_inc = out_dims[1];

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensors[0]);
   unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensors[0]);
   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset;

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensors[0]);
   offset = etna_ml_get_offset(subgraph, operation->output_tensors[0]);
   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;

   for (unsigned i = 0; i < tp_core; i++) {
      unsigned in_dims[3];
      unsigned out_dims[3];
      unsigned in_offset = 0;
      unsigned out_offset = 0;

      in_dims[0] = input_width;
      in_dims[1] = input_height;
      in_dims[2] = input_channels;

      out_dims[0] = output_width;
      out_dims[1] = output_height;
      out_dims[2] = output_channels;

      split_pad(subgraph, operation, i, tp_cores_used, in_dims, out_dims);

      in_offset = map->in_image_slice * in_dims[2];
      out_offset = out_dims[2];
      out_offset *= map->in_tile_x_size * map->in_tile_y_size;

      map->in_image_base_address += in_offset;
      map->out_image_base_address += out_offset;
   }

   map->out_loop_1_reset = 0x0;
   map->out_loop_2_reset = 0x0;
   map->out_loop_3_reset = 0x0;
   map->out_loop_0_inc = 0x0;
   map->out_loop_1_inc = 0x1;
   map->out_loop_0_count = 0x1;
   map->out_loop_1_count = out_dims[0];
   map->out_loop_2_count = out_dims[1];
   map->out_loop_3_count = 0x1;
   map->out_loop_2_inc = out_dims[0];
   map->out_loop_3_inc = 0x0;
   map->out_loop_6_inc = out_dims[0] * out_dims[1];

   map->in_zp = operation->input_zero_point;
   map->out_zp = operation->output_zero_point;

   if (tp_cores_used > 1)
      map->no_flush = tp_core < tp_cores_used - 1;

   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;

   etna_bo_cpu_fini(bo);

   return bo;
}

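/* Signed tensors are handled by biasing the zero point into the unsigned
 * 8-bit range.
 */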
static inline uint8_t
etna_tensor_zero_point(const struct pipe_tensor *tensor)
{
   if (tensor->is_signed) {
      assert(tensor->zero_point >= -128 && tensor->zero_point <= 127);
      return tensor->zero_point + 128;
   } else {
      assert(tensor->zero_point >= 0 && tensor->zero_point <= 255);
      return tensor->zero_point;
   }
}

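/* Lower the layout change of an input tensor to a TP TRANSPOSE operation,
 * allocating a new tensor to hold the transposed data.
 */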
void
etna_ml_lower_transpose(struct etna_ml_subgraph *subgraph,
                        const struct pipe_tensor *input_tensor,
                        struct etna_operation *operation,
                        unsigned *output_tensor)
{
   operation->type = ETNA_JOB_TYPE_TP;
   operation->tp_type = ETNA_ML_TP_TRANSPOSE;

   operation->input_tensors[0] = input_tensor->index;
   operation->input_count = 1;
   operation->input_width = input_tensor->dims[1];
   operation->input_height = input_tensor->dims[2];
   operation->input_channels = input_tensor->dims[3];
   operation->input_zero_point = etna_tensor_zero_point(input_tensor);
   operation->input_scale = input_tensor->scale;
   operation->input_tensor_sizes[0] = operation->input_width *
                                      operation->input_height *
                                      operation->input_channels;

   *output_tensor = etna_ml_allocate_tensor(subgraph);
   operation->output_tensors[0] = *output_tensor;
   operation->output_width = operation->input_width;
   operation->output_height = operation->input_height;
   operation->output_channels = operation->input_channels;
   operation->output_zero_point = operation->input_zero_point;
   operation->output_scale = operation->input_scale;
   operation->output_tensor_sizes[0] = operation->output_width *
                                       operation->output_height *
                                       operation->output_channels;
}

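/* Lower the layout change after a convolution to a TP DETRANSPOSE operation
 * that reads from a newly allocated tensor and writes into the convolution's
 * original output tensor.
 */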
void
etna_ml_lower_detranspose(struct etna_ml_subgraph *subgraph,
                          struct etna_operation *convolution,
                          struct etna_operation *operation)
{
   operation->type = ETNA_JOB_TYPE_TP;
   operation->tp_type = ETNA_ML_TP_DETRANSPOSE;

   operation->input_tensors[0] = etna_ml_allocate_tensor(subgraph);
   operation->input_count = 1;
   operation->input_width = convolution->output_width;
   operation->input_height = convolution->output_height;
   operation->input_channels = convolution->output_channels;
   operation->input_zero_point = convolution->output_zero_point;
   operation->input_scale = convolution->output_scale;
   operation->input_tensor_sizes[0] = operation->input_width *
                                      operation->input_height *
                                      operation->input_channels;

   operation->output_tensors[0] = convolution->output_tensors[0];
   operation->output_count = 1;
   operation->output_width = convolution->output_width;
   operation->output_height = convolution->output_height;
   operation->output_channels = convolution->output_channels;
   operation->output_zero_point = convolution->output_zero_point;
   operation->output_scale = convolution->output_scale;
   operation->output_tensor_sizes[0] = operation->output_width *
                                       operation->output_height *
                                       operation->output_channels;
}

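/* Lower the input transformation of a strided convolution to a TP RESHUFFLE
 * operation: the output tensor is stride*stride times deeper and
 * correspondingly smaller in width and height, with extra rows and columns
 * added when SAME padding is requested.
 */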
void
etna_ml_lower_reshuffle(struct etna_ml_subgraph *subgraph,
                        const struct pipe_ml_operation *convolution,
                        struct etna_operation *operation,
                        unsigned *output_tensor)
{
   operation->type = ETNA_JOB_TYPE_TP;
   operation->tp_type = ETNA_ML_TP_RESHUFFLE;
   operation->stride = convolution->conv.stride_x;
   operation->padding_same = convolution->conv.padding_same;

   operation->input_tensors[0] = convolution->input_tensors[0]->index;
   operation->input_count = 1;
   operation->input_width = convolution->input_tensors[0]->dims[1];
   operation->input_height = convolution->input_tensors[0]->dims[2];
   operation->input_channels = convolution->input_tensors[0]->dims[3];
   operation->input_zero_point = etna_tensor_zero_point(convolution->input_tensors[0]);
   operation->input_scale = convolution->input_tensors[0]->scale;
   operation->input_tensor_sizes[0] = operation->input_width *
                                      operation->input_height *
                                      operation->input_channels;

   *output_tensor = etna_ml_allocate_tensor(subgraph);
   operation->output_tensors[0] = *output_tensor;
   operation->output_width = DIV_ROUND_UP(operation->input_width, operation->stride);
   operation->output_height = DIV_ROUND_UP(operation->input_height, operation->stride);
   operation->output_channels = operation->input_channels * operation->stride * operation->stride;
   operation->output_zero_point = etna_tensor_zero_point(convolution->input_tensors[0]);
   operation->output_scale = convolution->input_tensors[0]->scale;
   operation->output_tensor_sizes[0] = operation->output_width *
                                       operation->output_height *
                                       operation->output_channels;

   /* When destriding a convolution, the transformation to be made to the input
    * tensor will depend on the size of the weight tensor.
    */
   operation->weight_width = convolution->conv.weight_tensor->dims[1];
   operation->weight_height = convolution->conv.weight_tensor->dims[2];

   if (operation->padding_same) {
      if (operation->weight_width == 5) {
         operation->output_width += 2;
         operation->output_height += 2;
      } else {
         operation->output_width += 1;
         operation->output_height += 1;
      }
   }
}

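/* Lower an explicit pad operation to a TP PAD job. */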
void
etna_ml_lower_pad(struct etna_ml_subgraph *subgraph,
                  const struct pipe_ml_operation *pad,
                  struct etna_operation *operation)
{
   operation->type = ETNA_JOB_TYPE_TP;
   operation->tp_type = ETNA_ML_TP_PAD;
   operation->stride = 1;

   operation->input_tensors[0] = pad->input_tensors[0]->index;
   operation->input_count = 1;
   operation->input_width = pad->input_tensors[0]->dims[1];
   operation->input_height = pad->input_tensors[0]->dims[2];
   operation->input_channels = pad->input_tensors[0]->dims[3];
   operation->input_tensor_sizes[0] = operation->input_width *
                                      operation->input_height *
                                      operation->input_channels;
   operation->input_zero_point = pad->input_tensors[0]->zero_point;
   operation->input_scale = pad->input_tensors[0]->scale;

   operation->output_tensors[0] = pad->output_tensors[0]->index;
   operation->output_width = pad->output_tensors[0]->dims[1];
   operation->output_height = pad->output_tensors[0]->dims[2];
   operation->output_channels = pad->output_tensors[0]->dims[3];
   operation->output_zero_point = pad->output_tensors[0]->zero_point;
   operation->output_scale = pad->output_tensors[0]->scale;
   operation->output_tensor_sizes[0] = operation->output_width *
                                       operation->output_height *
                                       operation->output_channels;
}

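/* Create the per-core configuration BOs for a TP operation. Transpose and
 * detranspose always run on a single core; reshuffle and pad are split
 * across the available TP cores when the input is large enough.
 */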
void
etna_ml_compile_operation_tp(struct etna_ml_subgraph *subgraph,
                             const struct etna_operation *operation,
                             struct etna_vip_instruction *instruction)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensors[0]);
   assert(input);
   pipe_resource_reference(&instruction->input, input);

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensors[0]);
   assert(output);
   pipe_resource_reference(&instruction->output, output);

   instruction->input_offset = etna_ml_get_offset(subgraph, operation->input_tensors[0]);
   instruction->output_offset = etna_ml_get_offset(subgraph, operation->output_tensors[0]);

   switch (operation->tp_type) {
   case ETNA_ML_TP_TRANSPOSE:
      instruction->configs[0] = create_transpose_config(subgraph, operation);
      break;
   case ETNA_ML_TP_DETRANSPOSE:
      instruction->configs[0] = create_detranspose_config(subgraph, operation);
      break;
   case ETNA_ML_TP_RESHUFFLE: {
      unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count;
      unsigned tp_cores_used;

      tp_cores_used = (operation->input_width > 8 || operation->input_channels > 1) ? tp_core_count : 1;

      /* TODO: Split these among the 4 cores as well, for better performance */
      if ((operation->input_width == 320 || operation->input_width == 224) &&
          operation->input_channels == 3)
         tp_cores_used = 1;

      ML_DBG("reshuffle: input_width %d tp_cores_used %d\n", operation->input_width, tp_cores_used);
      for (unsigned i = 0; i < tp_cores_used; i++) {
         instruction->configs[i] = create_reshuffle_config(subgraph, operation, i, tp_cores_used);
      }
      break;
   }
   case ETNA_ML_TP_PAD: {
      unsigned tp_cores_used = etna_ml_get_core_info(ctx)->tp_core_count;

      ML_DBG("pad: input_width %d tp_cores_used %d\n", operation->input_width, tp_cores_used);
      for (unsigned i = 0; i < tp_cores_used; i++) {
         instruction->configs[i] = create_pad_config(subgraph, operation, i, tp_cores_used);
      }
      break;
   }
   }
   instruction->type = ETNA_JOB_TYPE_TP;
   instruction->tp_type = operation->tp_type;
}

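/* Emit the command-stream states that kick the TP cores for one compiled
 * instruction: one VIVS_PS_TP_INST_ADDR write per populated config BO. The
 * offset in the reloc seems to act as a flag word rather than a real buffer
 * offset: intermediate cores of a multi-core job get 0x1 (0x1f when NPU jobs
 * run in parallel), and the last core gets the index of the following job.
 */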
void
etna_ml_emit_operation_tp(struct etna_ml_subgraph *subgraph,
                          struct etna_vip_instruction *operation,
                          unsigned idx)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   unsigned tp_core_count = etna_ml_get_core_info(ctx)->tp_core_count;
   struct etna_cmd_stream *stream = ctx->stream;
   bool more_than_one_tp_job = operation->configs[1] != NULL;
   bool parallel = DBG_ENABLED(ETNA_DBG_NPU_PARALLEL);

   for (unsigned j = 0; j < tp_core_count && operation->configs[j]; j++) {
      unsigned offset = parallel ? idx + 1 : 0;

      if (more_than_one_tp_job && (j < tp_core_count - 1))
         offset = parallel ? 0x1f : 0x1;

      etna_set_state(stream, VIVS_GL_OCB_REMAP_START, 0x0);
      etna_set_state(stream, VIVS_GL_OCB_REMAP_END, 0x0);
      etna_set_state(stream, VIVS_GL_TP_CONFIG, 0x0);

      if (operation->tp_type == ETNA_ML_TP_PAD) {
         etna_set_state(stream, VIVS_GL_UNK03950, j < tp_core_count - 1 ? 0x8 : 0x0);
      } else {
         etna_set_state(stream, VIVS_GL_UNK03950, 0x0);
      }

      etna_set_state_reloc(stream, VIVS_PS_TP_INST_ADDR, &(struct etna_reloc) {
         .bo = operation->configs[j],
         .flags = ETNA_RELOC_READ,
         .offset = offset,
      });
   }
   etna_set_state(stream, VIVS_PS_UNK10A4, parallel ? idx + 1 : 0x0);
}