/*
 * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
 * Copyright (c) 2024 Pengutronix, Philipp Zabel
 * SPDX-License-Identifier: MIT
 */

#include <time.h>
#include "util/u_inlines.h"
#include "util/u_math.h"
#include "etnaviv_context.h"
#include "etnaviv_debug.h"
#include "etnaviv_ml.h"
#include "etnaviv_ml_nn.h"
#include "etnaviv_screen.h"

static void *
map_resource(struct pipe_resource *resource)
{
   return etna_bo_map(etna_resource(resource)->bo);
}

#define FIELD(field, bits) uint32_t field : bits;

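/*
 * Header at the start of the coefficient buffer for V8 NN cores.
 * stream_size[] is a variable length array holding the size in bits of each
 * core's compressed weight stream.
 */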
struct etna_nn_header_v8 {
   FIELD(precode, 1)
   FIELD(bit16, 1)
   FIELD(fp16, 1)
   FIELD(reserved1, 1)
   FIELD(version, 4)

   uint8_t run_length_size;
   uint8_t run_length_table[18];
   uint32_t symbol_map;
   uint16_t avg_bias;
   uint16_t reserved2;
   uint32_t stream_size[0];
};

#define MAX_TILE_WIDTH 64

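/*
 * Given the tile size and interleave mode, work out how many superblocks the
 * output channels have to be split into so that each core's tiles fit in the
 * accumulation buffer.
 */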
static unsigned
calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_x, unsigned tile_y, unsigned interleave_mode)
{
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   struct etna_core_info *info = etna_gpu_get_core_info(ctx->screen->npu);
   unsigned nn_accum_buffer_depth = info->npu.nn_accum_buffer_depth;
   unsigned output_channels = operation->output_channels;
   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
   unsigned tiles_per_core;

   if (operation->weight_width == 1)
      tiles_per_core = nn_accum_buffer_depth / DIV_ROUND_UP(tile_y, interleave_mode);
   else {
      unsigned tile_size = DIV_ROUND_UP(DIV_ROUND_UP(tile_y * tile_x, operation->stride), 64);
      tiles_per_core = nn_accum_buffer_depth / (tile_size * operation->stride);
   }

   tiles_per_core = MIN2(tiles_per_core, (nn_accum_buffer_depth * 6) / 9);

   tiles_per_core = MIN2(tiles_per_core, kernels_per_core);
   tiles_per_core = MIN2(tiles_per_core, 127);

   kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * tiles_per_core);
   unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count);

   return DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels);
}

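/*
 * Pick the interleave mode (1, 2 or 4) that the tile width and kernel height
 * still allow: wider tiles and taller kernels force a lower mode.
 */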
static unsigned
calc_interleave_mode(struct etna_context *ctx, unsigned tile_width, unsigned weight_height)
{
   unsigned mode;

   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2)
      return 1;

   if (tile_width <= MAX_TILE_WIDTH / 2) {
      if (MAX_TILE_WIDTH / 4 < tile_width)
         mode = 2;
      else
         mode = 4;
   } else
      mode = 1;

   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4) {
      if (mode >= 2) {
         return 2;
      }
   } else {
      if (mode >= 4) {
         return 4;
      }
   }

   if (tile_width <= MAX_TILE_WIDTH / 2) {
      if (MAX_TILE_WIDTH / 4 < tile_width)
         return 2;
      else
         return 4;
   }

   return 1;
}

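/*
 * Compute the tile size used for this operation and return the number of
 * superblocks. tile_width_out and tile_height_out may be NULL if the caller
 * only needs the superblock count.
 */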
unsigned
etna_ml_calculate_tiling_v8(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
{
   unsigned nn_input_buffer_depth = etna_ml_get_core_info(ctx)->nn_input_buffer_depth;
   unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth;
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   unsigned output_width = operation->output_width;
   unsigned output_height = operation->output_height;
   unsigned output_channels = operation->output_channels;
   unsigned tile_width;
   unsigned tile_height;
   unsigned superblocks;
   unsigned interleave_mode;

   if (operation->addition)
      etna_ml_calc_addition_sizes(&input_width, &input_height, &input_channels,
                                  &output_width, &output_height, &output_channels);

   if (operation->pooling_first_pixel) {
      output_width *= 2;
      output_height *= 2;
   }

   tile_width = MIN2(output_width, 64);
   interleave_mode = calc_interleave_mode(ctx, tile_width, operation->weight_height);

   tile_height = nn_input_buffer_depth * interleave_mode - operation->weight_height + 1;
   tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth);
   tile_height = MIN2(tile_height, output_height);

   /* This gets us the best performance on MobileDet */
   /* TODO: Find the optimal value, or at least let the user override it */
   tile_height = MIN2(tile_height, 4);

   if (operation->stride > 1 && tile_height % 2 > 0)
      tile_height -= 1;

   tile_height = MAX2(tile_height, 1);

   superblocks = calc_superblocks(ctx, operation, tile_width, tile_height, interleave_mode);

   if (tile_width_out)
      *tile_width_out = tile_width;

   if (tile_height_out)
      *tile_height_out = tile_height;

   return superblocks;
}

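/*
 * Rearrange depthwise weights from channel-interleaved order into one
 * contiguous kernel per output channel, which is the layout the weight
 * encoder below indexes by output channel.
 */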
static void
reorder_for_hw_depthwise(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   uint8_t *input = map_resource(operation->weight_tensor);
   struct pipe_resource *output_res = etna_ml_create_resource(context, pipe_buffer_size(operation->weight_tensor));
   uint8_t (*output)[operation->weight_width * operation->weight_height] = (void *)map_resource(output_res);

   for (int i = 0; i < operation->weight_height * operation->weight_width * operation->output_channels; i++) {
      unsigned out_channel = i % operation->output_channels;

      output[out_channel][i / operation->output_channels] = input[i];
   }

   pipe_resource_reference(&operation->weight_tensor, NULL);
   operation->weight_tensor = output_res;
}

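/*
 * Bit writer: bits accumulate LSB-first in a 64-bit buffer and full 32-bit
 * words are stored through *map. With do_write == false only the write
 * pointer advances, which allows a dry run to measure the stream size.
 */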
struct bitstream {
   unsigned bits_in_buffer;
   uint64_t buffer;
   uint32_t **map;
   bool do_write;
};

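/*
 * Compute the correction term that folds the input zero point into the bias:
 * the sum over the kernel of (weight - weight_zero_point) * (128 - input_zero_point).
 */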
static uint32_t calculate_bias_correction(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint8_t *weights)
{
   unsigned input_channels;
   int32_t input_zero_point = 128 - operation->input_zero_point;
   int32_t correction = 0;

   if (operation->depthwise)
      input_channels = 1;
   else if (operation->addition)
      input_channels = 2 * operation->output_channels;
   else
      input_channels = operation->input_channels;

   if (operation->weight_signed) {
      /* See etna_tensor_zero_point() */
      int8_t weight_zero_point = operation->weight_zero_point - 128;

      for (unsigned i = 0; i < operation->weight_width * operation->weight_height * input_channels; i++) {
         correction += (((int8_t *)weights)[i] - weight_zero_point) * input_zero_point;
      }
   } else {
      for (unsigned i = 0; i < operation->weight_width * operation->weight_height * input_channels; i++) {
         correction += (weights[i] - operation->weight_zero_point) * input_zero_point;
      }
   }

   return correction;
}

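/* Append the size lowest bits of value, flushing complete 32-bit words to the output. */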
static void
append_bits(uint32_t value, size_t size, struct bitstream *bitstream)
{
   assert(value < 1 << size);
   if (!size)
      return;
   bitstream->buffer |= (uint64_t)value << bitstream->bits_in_buffer;
   bitstream->bits_in_buffer += size;
   if (bitstream->bits_in_buffer >= 32) {
      if (bitstream->do_write)
         **bitstream->map = bitstream->buffer & 0xffffffff;
      *bitstream->map += 1;
      bitstream->buffer >>= 32;
      bitstream->bits_in_buffer -= 32;
   }
}

static void
flush_bits(struct bitstream *bitstream)
{
   if (bitstream->bits_in_buffer > 0)
      append_bits(0, 32 - bitstream->bits_in_buffer, bitstream);
}

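/*
 * Zero run-length (ZRL) stream writer: runs of zero_point values are counted
 * and a zrl_bits wide count of preceding zeroes is written before each 8-bit
 * value.
 */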
struct wb_stream {
   struct bitstream bitstream;
   unsigned zero_point;
   unsigned zrl_bits;
   unsigned accum_zeroes;
};

static void
wb_stream_flush_zeroes(struct wb_stream *wb_stream)
{
   struct bitstream *bitstream = &wb_stream->bitstream;

   if (wb_stream->accum_zeroes == 0)
      return;

   append_bits(wb_stream->accum_zeroes - 1, wb_stream->zrl_bits, bitstream);
   wb_stream->accum_zeroes = 0;
   append_bits(wb_stream->zero_point, 8, bitstream);
}

static void
wb_stream_write(struct wb_stream *wb_stream, unsigned value)
{
   struct bitstream *bitstream = &wb_stream->bitstream;
   unsigned max_zeroes = (1 << wb_stream->zrl_bits) - 1;

   if (wb_stream->zrl_bits == 0) {
      append_bits(value, 8, bitstream);
      return;
   }

   if (wb_stream->accum_zeroes == max_zeroes) {
      append_bits(max_zeroes, wb_stream->zrl_bits, bitstream);
      wb_stream->accum_zeroes = 0;
      append_bits(value, 8, bitstream);
      return;
   }

   if (value == wb_stream->zero_point) {
      wb_stream->accum_zeroes++;
      return;
   }

   append_bits(wb_stream->accum_zeroes, wb_stream->zrl_bits, bitstream);
   wb_stream->accum_zeroes = 0;
   append_bits(value, 8, bitstream);
}

/*
 * The V8 architecture Huffman stream decoder uses a fixed code book with 8
 * entries to determine bit lengths of variable length values later in the bit
 * stream. The 2 to 5-bit long codes are stored in fixed length 3-bit (plus
 * optional 2-bit) fields:
 *
 *   code     symbol
 *   ---------------
 *   00_        0
 *   10_        1
 *   111        2
 *   110        3
 *   011        4
 *   010 1_     5
 *   010 01     6
 *   010 00     7
 *
 * The free bit (_) is used for the sign, if available, otherwise the sign
 * is stored with the variable length value later in the bitstream. In ZRL
 * encoding mode, where larger values are stored verbatim, this may also be
 * the lsb of the value instead. The decoder processes weights in pairs and
 * is pipelined 3-deep:
 *
 * In each step, first two 3-bit codes are read, then up to two 2-bit codes
 * that belong with (010) 3-bit codes from the previous step. The optional
 * 2-bit codes from the previous step, together with the 3-bit codes from the
 * step before that, are used to decode two symbols that are mapped to two bit
 * lengths for the two variable length values that are read next.
 *
 * Finally, the bit lengths, signs, and variable length values are used to
 * calculate two weights.
 */

struct code {
   /* fixed 3-bit code */
   uint8_t part0;
   /* optional 2-bit code, iff part0 == 0b010 */
   uint8_t part1;
   /* variable length value */
   uint8_t part2;
   /* bit length determined from part0, part1, and symbol-to-bitlength map */
   uint8_t part2_len;
};

struct encoder {
   /* bit-length-to-huffman-symbol map */
   uint8_t map[9];
   /* ring buffer for 3 encoded weight pairs */
   struct code code[6];
   size_t bytes_read;
   struct bitstream bitstream;
   uint32_t *initial_ptr;
   uint32_t *dest;
   uint8_t accum_zeroes;
   uint8_t avg_bias;
   bool zrl;
};

/* Calculate a histogram of bit lengths. */
static void histogram_accumulate(size_t histogram[9], uint8_t *bytes, size_t len, bool zrl)
{
   for (size_t i = 0; i < len; i++) {
      uint8_t num_bits = 0;
      if (bytes[i]) {
         bool sign = bytes[i] >> 7;
         uint8_t value = bytes[i];
         if (sign) {
            value -= zrl;
            value ^= 0xff;
         }
         num_bits = util_logbase2(value) + 1;
      }
      assert(num_bits <= 8);
      histogram[num_bits]++;
   }
}

/*
 * value can be an 8-bit raw value or a variable length value with prepended
 * sign. num_bits is the number of bits in value, including the sign bit.
 */
static struct code huffman_code(uint8_t sym, uint8_t value, uint8_t num_bits)
{
   switch (sym) {
   case 0:
      return (struct code){ 0 | ((value & 1) << 2), 0, value >> 1, num_bits - 1 };
   case 1:
      return (struct code){ 1 | ((value & 1) << 2), 0, value >> 1, num_bits - 1 };
   case 2:
      return (struct code){ 7, 0, value, num_bits };
   case 3:
      return (struct code){ 3, 0, value, num_bits };
   case 4:
      return (struct code){ 6, 0, value, num_bits };
   case 5:
      return (struct code){ 2, 1 | ((value & 1) << 1), value >> 1, num_bits - 1 };
   case 6:
      return (struct code){ 2, 2, value, num_bits };
   case 7:
      return (struct code){ 2, 0, value, num_bits };
   default:
      return (struct code){};
   }
}

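/*
 * Emit the two codes of the weight pair that was just completed, interleaved
 * with the deferred parts of the two previous pairs to match the 3-deep
 * decoder pipeline. For weights w0, w1, w2, ... the resulting bit layout is:
 *
 *   part0(w0) part0(w1)
 *   part0(w2) part0(w3) part1(w0) part1(w1)
 *   part0(w4) part0(w5) part1(w2) part1(w3) part2(w0) part2(w1)
 *   part0(w6) part0(w7) part1(w4) part1(w5) part2(w2) part2(w3)
 *   ...
 *
 * part1 fields are only present for codes whose part0 is 0b010.
 */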
static void emit_pair(struct encoder *encoder)
{
   struct bitstream *bitstream = &encoder->bitstream;
   struct code *code = &encoder->code[(encoder->bytes_read - 2) % 6];

   append_bits(code[0].part0, 3, bitstream);
   append_bits(code[1].part0, 3, bitstream);
   if (encoder->bytes_read > 2) {
      code = &encoder->code[(encoder->bytes_read - 4) % 6];
      append_bits(code[0].part1, code[0].part0 == 2 ? 2 : 0, bitstream);
      append_bits(code[1].part1, code[1].part0 == 2 ? 2 : 0, bitstream);
   }
   if (encoder->bytes_read > 4) {
      code = &encoder->code[(encoder->bytes_read - 6) % 6];
      append_bits(code[0].part2, code[0].part2_len, bitstream);
      append_bits(code[1].part2, code[1].part2_len, bitstream);
   }
}

/* Encode a single byte. Emit into the bitstream when a pair is complete. */
static void encode_byte(struct encoder *encoder, uint8_t byte)
{
   bool zrl = encoder->zrl;
   bool sign = byte >> 7;
   uint8_t value = byte;

   if (sign) {
      value -= zrl;
      value ^= 0xff;
   }

   uint8_t msb = util_logbase2(value);
   uint8_t num_bits = value ? (msb + 1) : 0;
   value &= ~(1 << msb);
   uint8_t sym = encoder->map[num_bits];
   if (zrl && byte == 0) {
      if (encoder->accum_zeroes <= 1) {
         // this seems to be used for the non-repeated 0 at the beginning and end
         sym = encoder->map[7];
         num_bits = 8;
      } else {
         // FIXME - how to encode run length into the run length table?
         num_bits = 1;
      }
   }
   if (!zrl && num_bits == 0) {
      num_bits = 1;
   }
   if (sym == 255 || (zrl && byte == 128)) {
      // if there is no huffman code assigned to this bit length, or when
      // encoding 0x80 in ZRL mode, dump the value into the bitstream verbatim.
      sym = encoder->map[7];
      value = byte;
      num_bits = 8;
   } else if (zrl && num_bits == 7) {
      value = byte;
      num_bits = 8;
   } else {
      value = (value << 1) | sign;
   }
   unsigned int i = encoder->bytes_read % 6;
   encoder->code[i] = huffman_code(sym, value, num_bits);
   encoder->bytes_read++;
   if ((encoder->bytes_read & 1) == 0)
      emit_pair(encoder);
}

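/*
 * Encode one weight. In ZRL mode runs of the zero point only increment
 * accum_zeroes; when a different value arrives, a single marker byte
 * (zero_point, 0x80 or 0x00 depending on the variant) is encoded first.
 * The customer 0x99 path keeps weights rebased only by avg_bias, while other
 * variants also subtract the weight zero point before encoding.
 */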
static void
encode_value(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, uint8_t value)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned customer_id = ctx->screen->info->customer_id;
   uint8_t zero_point = operation->weight_zero_point;

   value -= encoder->avg_bias;

   if (customer_id == 0x99) {
      if (encoder->zrl) {
         if (encoder->avg_bias > 0) {
            if (value == zero_point) {
               encoder->accum_zeroes++;
               return;
            } else if (encoder->accum_zeroes) {
               encode_byte(encoder, zero_point);
               encoder->accum_zeroes = 0;
            }
         } else {
            if (value == 0x0) {
               encoder->accum_zeroes++;
               return;
            } else if (encoder->accum_zeroes) {
               encode_byte(encoder, 0x80);
               encoder->accum_zeroes = 0;
            }
         }
      }

      encode_byte(encoder, value);
   } else {
      if (encoder->zrl) {
         if (value == zero_point) {
            encoder->accum_zeroes++;
            return;
         } else if (encoder->accum_zeroes) {
            encode_byte(encoder, 0x00);
            encoder->accum_zeroes = 0;
         }
      }

      encode_byte(encoder, value - zero_point);
   }
}

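/*
 * Initialize the encoder, inverting the given symbol-to-bit-length map into
 * the bit-length-to-symbol map used by encode_byte(). Unused bit lengths map
 * to 255 so that encode_byte() falls back to emitting the raw value. A NULL
 * initial_ptr selects a dry run that only counts bits.
 */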
static void encoder_init(struct encoder *encoder, uint8_t *map, uint32_t *initial_ptr)
{
   memset(encoder, 0, sizeof(*encoder));
   encoder->initial_ptr = initial_ptr;
   encoder->dest = initial_ptr;
   encoder->bitstream.map = &encoder->dest;
   encoder->bitstream.do_write = initial_ptr != NULL;

   for (int i = 0; i < 9; i++)
      encoder->map[i] = 255;

   for (int i = 0; i < 8; i++) {
      assert(map[i] < sizeof(encoder->map));
      encoder->map[map[i]] = i;
   }
}

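/* Encode a little-endian integer byte by byte, rebasing each byte by avg_bias like the weight values. */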
static void encode_uint32(struct encoder *encoder, uint32_t value)
{
   encode_byte(encoder, (value & 0xff) - encoder->avg_bias);
   encode_byte(encoder, ((value >> 8) & 0xff) - encoder->avg_bias);
   encode_byte(encoder, ((value >> 16) & 0xff) - encoder->avg_bias);
   encode_byte(encoder, ((value >> 24) & 0xff) - encoder->avg_bias);
}

static void encode_uint16(struct encoder *encoder, uint32_t value)
{
   encode_byte(encoder, (value & 0xff) - encoder->avg_bias);
   encode_byte(encoder, ((value >> 8) & 0xff) - encoder->avg_bias);
}

/*
 * Flush remaining weights stuck in the encoder ring buffer and all bits
 * in the bitstream FIFO. Return the total number of bits written.
 */
static size_t encoder_flush(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder)
{
   struct bitstream *bitstream = &encoder->bitstream;
   size_t total_bits;
   uint8_t flush_val = (encoder->bytes_read & 1) + 4;

   struct code code;
   if (encoder->bytes_read & 1)
      encode_byte(encoder, 0x0);

   code.part0 = (flush_val & 1) << 2;
   code.part1 = 0x0;
   code.part2 = 0x0;
   code.part2_len = 0x0;
   encoder->code[encoder->bytes_read++ % 6] = code;
   encoder->code[encoder->bytes_read++ % 6] = code;
   emit_pair(encoder);
   encoder->code[encoder->bytes_read++ % 6] = code;
   encoder->code[encoder->bytes_read++ % 6] = code;
   emit_pair(encoder);

   total_bits = (*bitstream->map - encoder->initial_ptr) * 32 +
                bitstream->bits_in_buffer;

   int padding_bits = 0;
   if (total_bits % (64 * 8) > 0)
      padding_bits = (64 * 8) - total_bits % (64 * 8);

   while (padding_bits > 0) {
      unsigned bits = padding_bits >= 32 ? 32 : padding_bits;
      append_bits(0, bits, bitstream);
      padding_bits -= bits;
   }

   return total_bits;
}

static void map_swap(uint8_t *map, int a, int b)
{
   uint8_t tmp = map[a];

   map[a] = map[b];
   map[b] = tmp;
}

/*
 * Sort the Huffman symbol to bit length map according to the histogram of bit
 * lengths, so that more common bit lengths are represented by shorter codes.
 * FIXME - doesn't take into account zrl mode properly.
 */
static void sort_map(uint8_t *map, size_t *histogram)
{
   const uint8_t network[19][2] = {
      {0, 2}, {1, 3}, {4, 6}, {5, 7},
      {0, 4}, {1, 5}, {2, 6}, {3, 7},
      {0, 1}, {2, 3}, {4, 5}, {6, 7},
      {2, 4}, {3, 5},
      {1, 4}, {3, 6},
      {1, 2}, {3, 4}, {5, 6},
   };

   for (int i = 0; i < 19; i++) {
      int a = network[i][0];
      int b = network[i][1];

      if (histogram[map[a]] < histogram[map[b]])
         map_swap(map, a, b);
   }
}

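/*
 * Rewind the encoder to start a new per-core stream at the current write
 * position, keeping the symbol map and avg_bias intact.
 */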
static void encoder_reset(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder)
{
   encoder->initial_ptr = *encoder->bitstream.map;
   encoder->dest = encoder->initial_ptr;
   encoder->bitstream.map = &encoder->dest;

   encoder->bitstream.buffer = 0;
   encoder->bitstream.bits_in_buffer = 0;
   encoder->bytes_read = 0;
   memset(encoder->code, 0, sizeof(encoder->code));
}

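/*
 * Encode the weights of one superblock: for every block of weights, walk the
 * kernels assigned to this core and pad partial blocks with the zero point.
 */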
static void encode_superblock(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, unsigned kernels_in_superblock, unsigned first_channel)
{
   struct pipe_context *pctx = subgraph->base.context;
   struct etna_context *ctx = etna_context(pctx);
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   unsigned input_channels = operation->input_channels;
   unsigned output_channels = operation->output_channels;
   unsigned kernel_size;
   uint8_t *weights = map_resource(operation->weight_tensor);
   unsigned block_size;
   unsigned blocks;

   if (operation->depthwise)
      input_channels = 1;
   else if (operation->addition)
      input_channels = 2 * output_channels;

   kernel_size = input_channels * operation->weight_height * operation->weight_width;

   uint8_t (*weights_map)[kernel_size] = (void *)weights;

   if (operation->depthwise)
      block_size = MAX2(operation->weight_height * operation->weight_width, 9);
   else
      block_size = 9;

   blocks = DIV_ROUND_UP(kernel_size, block_size);

   for (unsigned block = 0; block < blocks; block++) {
      for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
         unsigned oc;

         if (operation->depthwise) {
            oc = first_channel + kernel * nn_core_count;

            if (output_channels > 1 && oc >= (output_channels - output_channels % nn_core_count))
               oc -= nn_core_count - output_channels % nn_core_count;
         } else
            oc = first_channel + kernel;

         for (unsigned kernel_idx = 0; kernel_idx < block_size; kernel_idx++) {
            uint8_t weight;

            if (kernel_idx + block * block_size >= kernel_size)
               weight = operation->weight_zero_point;
            else if (operation->weight_signed)
               weight = ((int8_t *)(weights_map[oc]))[kernel_idx + block * block_size] + 128;
            else
               weight = weights_map[oc][kernel_idx + block * block_size];

            encode_value(subgraph, operation, encoder, weight);
         }

         if (operation->depthwise && block_size % 9)
            for (unsigned i = 0; i < 9 - block_size % 9; i++)
               encode_value(subgraph, operation, encoder, operation->weight_zero_point);
      }
   }
}

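/* Pack the 8 symbol-to-bit-length entries into one 32-bit header field, 4 bits each. */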
static uint32_t pack_symbol_map(uint8_t map[8])
{
   uint32_t ret = 0;

   for (int i = 0; i < 8; i++)
      ret |= map[i] << (4 * i);

   return ret;
}

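/*
 * Allocate a buffer sized for the worst case: header, per-core uncompressed
 * weights plus per-kernel overhead (with a 2x margin), and a 64-byte tail.
 */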
static struct etna_bo *
create_bo(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   unsigned input_channels = operation->input_channels;
   unsigned output_channels = operation->output_channels;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   size_t max_size;

   if (operation->depthwise)
      input_channels = 1;
   else if (operation->addition)
      input_channels = 2 * output_channels;

   unsigned header_size = 64;
   unsigned body_size = ALIGN(DIV_ROUND_UP(output_channels, cores_used) * (input_channels * operation->weight_height * operation->weight_width + 4 + 4), 64) * 2;
   unsigned tail_size = 64;
   max_size = header_size + cores_used * body_size + tail_size;

   return etna_ml_create_bo(context, max_size);
}

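/*
 * Build the symbol-to-bit-length map: histogram the bit lengths of all
 * weights and assign the shortest Huffman symbols to the most common ones.
 */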
static void
calculate_symbol_map(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint8_t *symbol_map)
{
   unsigned input_channels = operation->input_channels;
   unsigned output_channels = operation->output_channels;
   uint8_t *input = map_resource(operation->weight_tensor);
   size_t histogram[9] = {};

   if (operation->depthwise)
      input_channels = 1;
   else if (operation->addition)
      input_channels = 2 * output_channels;

   uint8_t (*weights_map)[input_channels][operation->weight_height][operation->weight_width] = (void *)input;
   unsigned kernel_size = operation->weight_width * operation->weight_height * input_channels;
   for (unsigned oc = 0; oc < output_channels; oc++)
      histogram_accumulate(histogram, (uint8_t *)weights_map[oc], kernel_size, false);

   for (int i = 0; i < 8; i++)
      symbol_map[i] = i;
   sort_map(symbol_map, histogram);
}

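/*
 * Compress the weights into one Huffman stream per NN core. Each stream
 * starts with the kernel count for that core, followed by the superblocks
 * assigned to it, and its size in bits is recorded in the header.
 */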
static void
fill_weights(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, struct etna_nn_header_v8 *header)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned output_channels = operation->output_channels;
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned superblocks = etna_ml_calculate_tiling_v8(ctx, operation, NULL, NULL);
   unsigned full_superblock = DIV_ROUND_UP(output_channels, nn_core_count * superblocks);

   unsigned channel_per_superblock[superblocks];
   for (unsigned superblock = 0; superblock < superblocks; superblock++)
      channel_per_superblock[superblock] = superblock * full_superblock * cores_used;

   for (unsigned core = 0; core < cores_used; core++) {
      unsigned kernels_per_core = output_channels / cores_used;
      if (core < output_channels % cores_used)
         kernels_per_core++;

      encoder_reset(subgraph, operation, encoder);
      encode_uint16(encoder, kernels_per_core);

      for (unsigned superblock = 0; superblock < superblocks; superblock++) {
         unsigned kernels_in_superblock = full_superblock;
         if (superblock == superblocks - 1) {
            unsigned remaining_channels = output_channels - cores_used * (superblocks - 1) * full_superblock;
            kernels_in_superblock = remaining_channels / cores_used;
            if (core < remaining_channels % cores_used)
               kernels_in_superblock += 1;
         }

         unsigned first_channel;
         if (operation->depthwise)
            first_channel = cores_used - core - 1 + cores_used * full_superblock * superblock;
         else
            first_channel = channel_per_superblock[superblock];

         encode_superblock(subgraph, operation, encoder, kernels_in_superblock, first_channel);

         channel_per_superblock[superblock] += kernels_in_superblock;
      }

      unsigned actual_bits = encoder_flush(subgraph, operation, encoder);
      header->stream_size[core] = actual_bits;
   }
}

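/*
 * Append one 32-bit bias per output channel, each corrected for the input
 * zero point contribution of its kernel. Returns the pointer past the last
 * bias written.
 */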
static uint32_t *
fill_biases(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint32_t *map)
{
   uint8_t *input = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned input_channels = operation->input_channels;
   unsigned output_channels = operation->output_channels;

   if (operation->depthwise)
      input_channels = 1;
   else if (operation->addition)
      input_channels = 2 * output_channels;

   uint8_t (*weights_map)[input_channels][operation->weight_height][operation->weight_width] = (void *)input;
   for (unsigned oc = 0; oc < output_channels; oc++) {
      uint32_t corr = calculate_bias_correction(subgraph, operation, (uint8_t *)weights_map[oc]);

      *map = biases[oc] + corr;
      map++;
   }

   return map;
}

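/*
 * Build the coefficient buffer consumed by V8 NN cores: a 64-byte aligned
 * header, the per-core compressed weight streams and the corrected biases.
 * cache_size returns the number of bytes that will go into the SRAM cache.
 */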
struct etna_bo *
etna_ml_create_coeffs_v8(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size)
{
   struct etna_bo *bo = create_bo(subgraph, operation);
   uint32_t *map = etna_bo_map(bo);
   struct etna_nn_header_v8 *header = (struct etna_nn_header_v8 *)map;
   struct encoder encoder;
   uint8_t symbol_map[8];

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   calculate_symbol_map(subgraph, operation, symbol_map);
   header->symbol_map = pack_symbol_map(symbol_map);
   header->version = 1;

   map += ALIGN(sizeof(*header), 64) / 4;

   encoder_init(&encoder, symbol_map, map);

   fill_weights(subgraph, operation, &encoder, header);
   map = fill_biases(subgraph, operation, encoder.dest);

   /* Size of the data that will go into the SRAM cache, header included */
   *cache_size = (uint8_t *)map - (uint8_t *)etna_bo_map(bo);

   etna_bo_cpu_fini(bo);

   return bo;
}