/*
 * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
 * Copyright (c) 2024 Pengutronix, Philipp Zabel
 * SPDX-License-Identifier: MIT
 */

#include <time.h>
#include "util/u_inlines.h"
#include "util/u_math.h"
#include "etnaviv_context.h"
#include "etnaviv_debug.h"
#include "etnaviv_ml.h"
#include "etnaviv_ml_nn.h"
#include "etnaviv_screen.h"

static void *
map_resource(struct pipe_resource *resource)
{
   return etna_bo_map(etna_resource(resource)->bo);
}

#define FIELD(field, bits) uint32_t field : bits;

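/*
 * Header that precedes the per-core weight streams in the coefficients
 * buffer, as written by this file for V8 NPUs. stream_size[] holds the size
 * in bits of each core's Huffman-coded weight stream.
 */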
struct etna_nn_header_v8 {
   FIELD(precode, 1)
   FIELD(bit16, 1)
   FIELD(fp16, 1)
   FIELD(reserved1, 1)
   FIELD(version, 4)

   uint8_t run_length_size;
   uint8_t run_length_table[18];
   uint32_t symbol_map;
   uint16_t avg_bias;
   uint16_t reserved2;
   uint32_t stream_size[0];
};

#define MAX_TILE_WIDTH 64

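/*
 * Compute how many superblocks the kernels assigned to one NN core are split
 * into, limited by the accumulation buffer depth and the tile size.
 */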
static unsigned
calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_x, unsigned tile_y, unsigned interleave_mode)
{
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   struct etna_core_info *info = etna_gpu_get_core_info(ctx->screen->npu);
   unsigned nn_accum_buffer_depth = info->npu.nn_accum_buffer_depth;
   unsigned output_channels = operation->output_channels;
   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
   unsigned tiles_per_core;

   if (operation->weight_width == 1)
      tiles_per_core = nn_accum_buffer_depth / DIV_ROUND_UP(tile_y, interleave_mode);
   else {
      unsigned tile_size = DIV_ROUND_UP(DIV_ROUND_UP(tile_y * tile_x, operation->stride), 64);
      tiles_per_core = nn_accum_buffer_depth / (tile_size * operation->stride);
   }

   tiles_per_core = MIN2(tiles_per_core, (nn_accum_buffer_depth * 6) / 9);

   tiles_per_core = MIN2(tiles_per_core, kernels_per_core);
   tiles_per_core = MIN2(tiles_per_core, 127);

   kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * tiles_per_core);
   unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count);

   return DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels);
}

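/*
 * Choose the interleave mode (1, 2 or 4) for a tile of the given width and a
 * kernel of the given height, constrained by MAX_TILE_WIDTH.
 */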
static unsigned
calc_interleave_mode(struct etna_context *ctx, unsigned tile_width, unsigned weight_height)
{
   unsigned mode;

   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2)
      return 1;

   if (tile_width <= MAX_TILE_WIDTH / 2) {
      if (MAX_TILE_WIDTH / 4 < tile_width)
         mode = 2;
      else
         mode = 4;
   } else
      mode = 1;

   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4) {
      if (mode >= 2) {
         return 2;
      }
   } else {
      if (mode >= 4) {
         return 4;
      }
   }

   if (tile_width <= MAX_TILE_WIDTH / 2) {
      if (MAX_TILE_WIDTH / 4 < tile_width)
         return 2;
      else
         return 4;
   }

   return 1;
}

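/*
 * Calculate the tile size used to walk the output feature map and return the
 * number of superblocks per core. The chosen tile width and height are
 * returned through the optional out parameters.
 */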
unsigned
etna_ml_calculate_tiling_v8(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
{
   unsigned nn_input_buffer_depth = etna_ml_get_core_info(ctx)->nn_input_buffer_depth;
   unsigned nn_accum_buffer_depth = etna_ml_get_core_info(ctx)->nn_accum_buffer_depth;
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   unsigned output_width = operation->output_width;
   unsigned output_height = operation->output_height;
   unsigned output_channels = operation->output_channels;
   unsigned tile_width;
   unsigned tile_height;
   unsigned superblocks;
   unsigned interleave_mode;

   if (operation->addition)
      etna_ml_calc_addition_sizes(&input_width, &input_height, &input_channels,
                                  &output_width, &output_height, &output_channels);

   if (operation->pooling_first_pixel) {
      output_width *= 2;
      output_height *= 2;
   }

   tile_width = MIN2(output_width, 64);
   interleave_mode = calc_interleave_mode(ctx, tile_width, operation->weight_height);

   tile_height = nn_input_buffer_depth * interleave_mode - operation->weight_height + 1;
   tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth);
   tile_height = MIN2(tile_height, output_height);

   /* This gets us the best performance on MobileDet */
   /* TODO: Find the optimal value, or at least let the user override it */
   tile_height = MIN2(tile_height, 4);

   if (operation->stride > 1 && tile_height % 2 > 0)
      tile_height -= 1;

   tile_height = MAX2(tile_height, 1);

   superblocks = calc_superblocks(ctx, operation, tile_width, tile_height, interleave_mode);

   if (tile_width_out)
      *tile_width_out = tile_width;

   if (tile_height_out)
      *tile_height_out = tile_height;

   return superblocks;
}

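/*
 * Rewrite the depthwise weight tensor from channel-interleaved order into one
 * contiguous kernel per output channel, replacing the operation's weight
 * resource with the reordered copy.
 */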
static void
reorder_for_hw_depthwise(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   uint8_t *input = map_resource(operation->weight_tensor);
   struct pipe_resource *output_res = etna_ml_create_resource(context, pipe_buffer_size(operation->weight_tensor));
   uint8_t (*output)[operation->weight_width * operation->weight_height] = (void *)map_resource(output_res);

   for (int i = 0; i < operation->weight_height * operation->weight_width * operation->output_channels; i++) {
      unsigned out_channel = i % operation->output_channels;

      output[out_channel][i / operation->output_channels] = input[i];
   }

   pipe_resource_reference(&operation->weight_tensor, NULL);
   operation->weight_tensor = output_res;
}

struct bitstream {
   unsigned bits_in_buffer;
   uint64_t buffer;
   uint32_t **map;
   bool do_write;
};

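/*
 * Compute the correction term that folds the input and weight zero points
 * into the per-channel bias: the sum of the zero-point-adjusted weights of
 * one kernel, scaled by the input zero point.
 */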
static uint32_t calculate_bias_correction(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint8_t *weights)
{
   unsigned input_channels;
   int32_t input_zero_point = 128 - operation->input_zero_point;
   int32_t correction = 0;

   if (operation->depthwise)
      input_channels = 1;
   else if (operation->addition)
      input_channels = 2 * operation->output_channels;
   else
      input_channels = operation->input_channels;

   if (operation->weight_signed) {
      /* See etna_tensor_zero_point() */
      int8_t weight_zero_point = operation->weight_zero_point - 128;

      for (unsigned i = 0; i < operation->weight_width * operation->weight_height * input_channels; i++) {
         correction += (((int8_t *)weights)[i] - weight_zero_point) * input_zero_point;
      }
   } else {
      for (unsigned i = 0; i < operation->weight_width * operation->weight_height * input_channels; i++) {
         correction += (weights[i] - operation->weight_zero_point) * input_zero_point;
      }
   }

   return correction;
}

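/*
 * Append the size lowest bits of value to the bitstream. Complete 32-bit
 * words are written out only when do_write is set, but the destination
 * pointer is advanced either way, so the same code can be used for sizing.
 */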
static void
append_bits(uint32_t value, size_t size, struct bitstream *bitstream)
{
   assert(value < 1 << size);
   if (!size)
      return;
   bitstream->buffer |= (uint64_t)value << bitstream->bits_in_buffer;
   bitstream->bits_in_buffer += size;
   if (bitstream->bits_in_buffer >= 32) {
      if (bitstream->do_write)
         **bitstream->map = bitstream->buffer & 0xffffffff;
      *bitstream->map += 1;
      bitstream->buffer >>= 32;
      bitstream->bits_in_buffer -= 32;
   }
}

static void
flush_bits(struct bitstream *bitstream)
{
   if (bitstream->bits_in_buffer > 0)
      append_bits(0, 32 - bitstream->bits_in_buffer, bitstream);
}

struct wb_stream {
   struct bitstream bitstream;
   unsigned zero_point;
   unsigned zrl_bits;
   unsigned accum_zeroes;
};

static void
wb_stream_flush_zeroes(struct wb_stream *wb_stream)
{
   struct bitstream *bitstream = &wb_stream->bitstream;

   if (wb_stream->accum_zeroes == 0)
      return;

   append_bits(wb_stream->accum_zeroes - 1, wb_stream->zrl_bits, bitstream);
   wb_stream->accum_zeroes = 0;
   append_bits(wb_stream->zero_point, 8, bitstream);
}

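/*
 * Write one weight byte with zero run-length (ZRL) encoding: runs of the zero
 * point are accumulated and emitted as a zrl_bits-wide run length followed by
 * the byte that terminated the run.
 */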
static void
wb_stream_write(struct wb_stream *wb_stream, unsigned value)
{
   struct bitstream *bitstream = &wb_stream->bitstream;
   unsigned max_zeroes = (1 << wb_stream->zrl_bits) - 1;

   if (wb_stream->zrl_bits == 0) {
      append_bits(value, 8, bitstream);
      return;
   }

   if (wb_stream->accum_zeroes == max_zeroes) {
      append_bits(max_zeroes, wb_stream->zrl_bits, bitstream);
      wb_stream->accum_zeroes = 0;
      append_bits(value, 8, bitstream);
      return;
   }

   if (value == wb_stream->zero_point) {
      wb_stream->accum_zeroes++;
      return;
   }

   append_bits(wb_stream->accum_zeroes, wb_stream->zrl_bits, bitstream);
   wb_stream->accum_zeroes = 0;
   append_bits(value, 8, bitstream);
}

/*
 * The V8 architecture Huffman stream decoder uses a fixed code book with 8
 * entries to determine bit lengths of variable length values later in the bit
 * stream. The 2 to 5-bit long codes are stored in fixed length 3-bit (plus
 * optional 2-bit) fields:
 *
 *     code   symbol
 *    --------------
 *    00_       0
 *    10_       1
 *    111       2
 *    110       3
 *    011       4
 *    010 1_    5
 *    010 01    6
 *    010 00    7
 *
 * The free bit (_) is used for the sign, if available, otherwise the sign
 * is stored with the variable length value later in the bitstream. In ZRL
 * encoding mode, where larger values are stored verbatim, this may also be
 * the lsb of the value instead. The decoder processes weights in pairs and
 * is pipelined 3-deep:
 *
 * In each step, first two 3-bit codes are read, then up to two 2-bit codes
 * that belong with (010) 3-bit codes from the previous step. The optional
 * 2-bit codes from the previous step, together with the 3-bit codes from the
 * step before that, are used to decode two symbols that are mapped to two bit
 * lengths for the two variable length values that are read next.
 *
 * Finally, the bit lengths, signs, and variable length values are used to
 * calculate two weights.
 */

struct code {
   /* fixed 3-bit code */
   uint8_t part0;
   /* optional 2-bit code, iff part0 == 0b010 */
   uint8_t part1;
   /* variable length value */
   uint8_t part2;
   /* bit length determined from part0, part1, and symbol-to-bitlength map */
   uint8_t part2_len;
};

struct encoder {
   /* bit-length-to-huffman-symbol map */
   uint8_t map[9];
   /* ring buffer for 3 encoded weight pairs */
   struct code code[6];
   size_t bytes_read;
   struct bitstream bitstream;
   uint32_t *initial_ptr;
   uint32_t *dest;
   uint8_t accum_zeroes;
   uint8_t avg_bias;
   bool zrl;
};

/* Calculate a histogram of bit lengths. */
static void histogram_accumulate(size_t histogram[9], uint8_t *bytes, size_t len, bool zrl)
{
   for (size_t i = 0; i < len; i++) {
      uint8_t num_bits = 0;
      if (bytes[i]) {
         bool sign = bytes[i] >> 7;
         uint8_t value = bytes[i];
         if (sign) {
            value -= zrl;
            value ^= 0xff;
         }
         num_bits = util_logbase2(value) + 1;
      }
      assert(num_bits <= 8);
      histogram[num_bits]++;
   }
}

/*
 * value can be 8-bit raw value or variable length value with prepended sign.
 * num_bits is number of bits in value, including the sign bit.
 */
static struct code huffman_code(uint8_t sym, uint8_t value, uint8_t num_bits)
{
   switch (sym) {
   case 0:
      return (struct code){ 0 | ((value & 1) << 2), 0, value >> 1, num_bits - 1 };
   case 1:
      return (struct code){ 1 | ((value & 1) << 2), 0, value >> 1, num_bits - 1 };
   case 2:
      return (struct code){ 7, 0, value, num_bits};
   case 3:
      return (struct code){ 3, 0, value, num_bits};
   case 4:
      return (struct code){ 6, 0, value, num_bits};
   case 5:
      return (struct code){ 2, 1 | ((value & 1) << 1), value >> 1, num_bits - 1 };
   case 6:
      return (struct code){ 2, 2, value, num_bits};
   case 7:
      return (struct code){ 2, 0, value, num_bits};
   default:
      return (struct code){};
   }
}

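/*
 * Emit the interleaved bitstream fields for the latest weight pair: the fixed
 * 3-bit codes of this pair, the optional 2-bit codes of the previous pair and
 * the variable length values of the pair before that, matching the 3-deep
 * decoder pipeline described above.
 */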
static void emit_pair(struct encoder *encoder)
{
   struct bitstream *bitstream = &encoder->bitstream;
   struct code *code = &encoder->code[(encoder->bytes_read - 2) % 6];

   append_bits(code[0].part0, 3, bitstream);
   append_bits(code[1].part0, 3, bitstream);
   if (encoder->bytes_read > 2) {
      code = &encoder->code[(encoder->bytes_read - 4) % 6];
      append_bits(code[0].part1, code[0].part0 == 2 ? 2 : 0, bitstream);
      append_bits(code[1].part1, code[1].part0 == 2 ? 2 : 0, bitstream);
   }
   if (encoder->bytes_read > 4) {
      code = &encoder->code[(encoder->bytes_read - 6) % 6];
      append_bits(code[0].part2, code[0].part2_len, bitstream);
      append_bits(code[1].part2, code[1].part2_len, bitstream);
   }
}

/* Encode a single byte. Emit into the bitstream when a pair is complete. */
static void encode_byte(struct encoder *encoder, uint8_t byte)
{
   bool zrl = encoder->zrl;
   bool sign = byte >> 7;
   uint8_t value = byte;

   if (sign) {
      value -= zrl;
      value ^= 0xff;
   }

   uint8_t msb = util_logbase2(value);
   uint8_t num_bits = value ? (msb + 1) : 0;
   value &= ~(1 << msb);
   uint8_t sym = encoder->map[num_bits];
   if (zrl && byte == 0) {
      if (encoder->accum_zeroes <= 1) {
         // this seems to be used for the non-repeated 0 at the beginning and end
         sym = encoder->map[7];
         num_bits = 8;
      } else {
         // FIXME - how to encode run length into the run length table?
         num_bits = 1;
      }
   }
   if (!zrl && num_bits == 0) {
      num_bits = 1;
   }
   if (sym == 255 || (zrl && byte == 128)) {
      // if there is no huffman code assigned to this bit length, or when
      // encoding 0x80 in ZRL mode, dump the value into the bitstream verbatim.
      sym = encoder->map[7];
      value = byte;
      num_bits = 8;
   } else if (zrl && num_bits == 7) {
      value = byte;
      num_bits = 8;
   } else {
      value = (value << 1) | sign;
   }
   unsigned int i = encoder->bytes_read % 6;
   encoder->code[i] = huffman_code(sym, value, num_bits);
   encoder->bytes_read++;
   if ((encoder->bytes_read & 1) == 0)
      emit_pair(encoder);
}

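/*
 * Encode one weight value, subtracting the average bias and accumulating zero
 * runs when ZRL mode is enabled. The run marker and zero point handling
 * differ between the customer_id 0x99 path and the default path.
 */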
static void
encode_value(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, uint8_t value)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned customer_id = ctx->screen->info->customer_id;
   uint8_t zero_point = operation->weight_zero_point;

   value -= encoder->avg_bias;

   if (customer_id == 0x99) {
      if (encoder->zrl) {
         if (encoder->avg_bias > 0) {
            if (value == zero_point) {
               encoder->accum_zeroes++;
               return;
            } else if (encoder->accum_zeroes) {
               encode_byte(encoder, zero_point);
               encoder->accum_zeroes = 0;
            }
         } else {
            if (value == 0x0) {
               encoder->accum_zeroes++;
               return;
            } else if (encoder->accum_zeroes) {
               encode_byte(encoder, 0x80);
               encoder->accum_zeroes = 0;
            }
         }
      }

      encode_byte(encoder, value);
   } else {
      if (encoder->zrl) {
         if (value == zero_point) {
            encoder->accum_zeroes++;
            return;
         } else if (encoder->accum_zeroes) {
            encode_byte(encoder, 0x00);
            encoder->accum_zeroes = 0;
         }
      }

      encode_byte(encoder, value - zero_point);
   }
}

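/*
 * Initialize the encoder. The incoming map gives the bit length assigned to
 * each Huffman symbol; encoder->map stores the inverse (bit length to
 * symbol), with 255 marking bit lengths that have no symbol. When initial_ptr
 * is NULL, the encoder only counts bits without writing anything.
 */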
static void encoder_init(struct encoder *encoder, uint8_t *map, uint32_t *initial_ptr)
{
   memset(encoder, 0, sizeof(*encoder));
   encoder->initial_ptr = initial_ptr;
   encoder->dest = initial_ptr;
   encoder->bitstream.map = &encoder->dest;
   encoder->bitstream.do_write = initial_ptr != NULL;

   for (int i = 0; i < 9; i++)
      encoder->map[i] = 255;

   for (int i = 0; i < 8; i++) {
      assert(map[i] < sizeof(encoder->map));
      encoder->map[map[i]] = i;
   }
}

static void encode_uint32(struct encoder *encoder, uint32_t value)
{
   encode_byte(encoder, (value & 0xff) - encoder->avg_bias);
   encode_byte(encoder, ((value >> 8) & 0xff) - encoder->avg_bias);
   encode_byte(encoder, ((value >> 16) & 0xff) - encoder->avg_bias);
   encode_byte(encoder, ((value >> 24) & 0xff) - encoder->avg_bias);
}

static void encode_uint16(struct encoder *encoder, uint32_t value)
{
   encode_byte(encoder, (value & 0xff) - encoder->avg_bias);
   encode_byte(encoder, ((value >> 8) & 0xff) - encoder->avg_bias);
}

/*
 * Flush remaining weights stuck in the encoder ring buffer and all bits
 * in the bitstream FIFO. Return the total number of bits written.
 */
static size_t encoder_flush(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder)
{
   struct bitstream *bitstream = &encoder->bitstream;
   size_t total_bits;
   uint8_t flush_val = (encoder->bytes_read & 1) + 4;

   struct code code;
   if (encoder->bytes_read & 1)
      encode_byte(encoder, 0x0);

   code.part0 = (flush_val & 1) << 2;
   code.part1 = 0x0;
   code.part2 = 0x0;
   code.part2_len = 0x0;
   encoder->code[encoder->bytes_read++ % 6] = code;
   encoder->code[encoder->bytes_read++ % 6] = code;
   emit_pair(encoder);
   encoder->code[encoder->bytes_read++ % 6] = code;
   encoder->code[encoder->bytes_read++ % 6] = code;
   emit_pair(encoder);

   total_bits = (*bitstream->map - encoder->initial_ptr) * 32 +
                bitstream->bits_in_buffer;

   int padding_bits = 0;
   if (total_bits % (64 * 8) > 0)
      padding_bits = (64 * 8) - total_bits % (64 * 8);

   while (padding_bits > 0) {
      unsigned bits = padding_bits >= 32 ? 32 : padding_bits;
      append_bits(0, bits, bitstream);
      padding_bits -= bits;
   }

   return total_bits;
}

static void map_swap(uint8_t *map, int a, int b)
{
   uint8_t tmp = map[a];

   map[a] = map[b];
   map[b] = tmp;
}

/*
 * Sort the Huffman symbol to bit length map according to the histogram of bit
 * lengths, so that more common bit lengths are represented by shorter codes.
 * FIXME - doesn't take into account zrl mode properly.
 */
static void sort_map(uint8_t *map, size_t *histogram)
{
   const uint8_t network[19][2] = {
      {0, 2}, {1, 3}, {4, 6}, {5, 7},
      {0, 4}, {1, 5}, {2, 6}, {3, 7},
      {0, 1}, {2, 3}, {4, 5}, {6, 7},
      {2, 4}, {3, 5},
      {1, 4}, {3, 6},
      {1, 2}, {3, 4}, {5, 6},
   };

   for (int i = 0; i < 19; i++) {
      int a = network[i][0];
      int b = network[i][1];

      if (histogram[map[a]] < histogram[map[b]])
         map_swap(map, a, b);
   }
}

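/*
 * Reset the bitstream state before encoding the next core's stream, keeping
 * the symbol map and continuing at the current destination pointer.
 */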
static void encoder_reset(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder)
{
   encoder->initial_ptr = *encoder->bitstream.map;
   encoder->dest = encoder->initial_ptr;
   encoder->bitstream.map = &encoder->dest;

   encoder->bitstream.buffer = 0;
   encoder->bitstream.bits_in_buffer = 0;
   encoder->bytes_read = 0;
   memset(encoder->code, 0, sizeof(encoder->code));
}

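/*
 * Encode the weights of one superblock for one core: blocks of block_size
 * weights are emitted for each kernel in the superblock in turn, padding with
 * the weight zero point past the end of a kernel and, for depthwise
 * convolutions, up to a multiple of 9 weights per block.
 */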
static void encode_superblock(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, unsigned kernels_in_superblock, unsigned first_channel)
{
   struct pipe_context *pctx = subgraph->base.context;
   struct etna_context *ctx = etna_context(pctx);
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   unsigned input_channels = operation->input_channels;
   unsigned output_channels = operation->output_channels;
   unsigned kernel_size;
   uint8_t *weights = map_resource(operation->weight_tensor);
   unsigned block_size;
   unsigned blocks;

   if (operation->depthwise)
      input_channels = 1;
   else if (operation->addition)
      input_channels = 2 * output_channels;

   kernel_size = input_channels * operation->weight_height * operation->weight_width;

   uint8_t (*weights_map)[kernel_size] = (void *)weights;

   if (operation->depthwise)
      block_size = MAX2(operation->weight_height * operation->weight_width, 9);
   else
      block_size = 9;

   blocks = DIV_ROUND_UP(kernel_size, block_size);

   for (unsigned block = 0; block < blocks; block++) {
      for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
         unsigned oc;

         if (operation->depthwise) {
            oc = first_channel + kernel * nn_core_count;

            if (output_channels > 1 && oc >= (output_channels - output_channels % nn_core_count))
               oc -= nn_core_count - output_channels % nn_core_count;
         } else
            oc = first_channel + kernel;

         for (unsigned kernel_idx = 0; kernel_idx < block_size; kernel_idx++) {
            uint8_t weight;

            if (kernel_idx + block * block_size >= kernel_size)
               weight = operation->weight_zero_point;
            else if (operation->weight_signed)
               weight = ((int8_t *)(weights_map[oc]))[kernel_idx + block * block_size] + 128;
            else
               weight = weights_map[oc][kernel_idx + block * block_size];

            encode_value(subgraph, operation, encoder, weight);
         }

         if (operation->depthwise && block_size % 9)
            for (unsigned i = 0; i < 9 - block_size % 9; i++)
               encode_value(subgraph, operation, encoder, operation->weight_zero_point);
      }
   }
}

static uint32_t pack_symbol_map(uint8_t map[8])
{
   uint32_t ret = 0;

   for (int i = 0; i < 8; i++)
      ret |= map[i] << (4 * i);

   return ret;
}

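/*
 * Allocate a buffer object with a conservative worst-case size for the
 * header, the per-core weight streams (assuming no compression gain) and the
 * biases.
 */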
static struct etna_bo *
create_bo(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   unsigned input_channels = operation->input_channels;
   unsigned output_channels = operation->output_channels;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   size_t max_size;

   if (operation->depthwise)
      input_channels = 1;
   else if (operation->addition)
      input_channels = 2 * output_channels;

   unsigned header_size = 64;
   unsigned body_size = ALIGN(DIV_ROUND_UP(output_channels, cores_used) * (input_channels * operation->weight_height * operation->weight_width + 4 + 4), 64) * 2;
   unsigned tail_size = 64;
   max_size = header_size + cores_used * body_size + tail_size;

   return etna_ml_create_bo(context, max_size);
}

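/*
 * Build the symbol-to-bit-length map for the Huffman code book: accumulate a
 * histogram of weight bit lengths and sort the identity map so that the most
 * frequent bit lengths get the shortest codes.
 */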
static void
calculate_symbol_map(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint8_t *symbol_map)
{
   unsigned input_channels = operation->input_channels;
   unsigned output_channels = operation->output_channels;
   uint8_t *input = map_resource(operation->weight_tensor);
   size_t histogram[9] = {};

   if (operation->depthwise)
      input_channels = 1;
   else if (operation->addition)
      input_channels = 2 * output_channels;

   uint8_t (*weights_map)[input_channels][operation->weight_height][operation->weight_width] = (void *)input;
   unsigned kernel_size = operation->weight_width * operation->weight_height * input_channels;
   for (unsigned oc = 0; oc < output_channels; oc++)
      histogram_accumulate(histogram, (uint8_t *)weights_map[oc], kernel_size, false);

   for (int i = 0; i < 8; i++)
      symbol_map[i] = i;
   sort_map(symbol_map, histogram);
}

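/*
 * Encode one Huffman-coded weight stream per NN core, preceded by the number
 * of kernels assigned to that core, and record each stream's size in bits in
 * the header.
 */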
static void
fill_weights(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct encoder *encoder, struct etna_nn_header_v8 *header)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned output_channels = operation->output_channels;
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned superblocks = etna_ml_calculate_tiling_v8(ctx, operation, NULL, NULL);
   unsigned full_superblock = DIV_ROUND_UP(output_channels, nn_core_count * superblocks);

   unsigned channel_per_superblock[superblocks];
   for (unsigned superblock = 0; superblock < superblocks; superblock++)
      channel_per_superblock[superblock] = superblock * full_superblock * cores_used;

   for (unsigned core = 0; core < cores_used; core++) {
      unsigned kernels_per_core = output_channels / cores_used;
      if (core < output_channels % cores_used)
         kernels_per_core++;

      encoder_reset(subgraph, operation, encoder);
      encode_uint16(encoder, kernels_per_core);

      for (unsigned superblock = 0; superblock < superblocks; superblock++) {

         unsigned kernels_in_superblock = full_superblock;
         if (superblock == superblocks - 1) {
            unsigned remaining_channels = output_channels - cores_used * (superblocks - 1) * full_superblock;
            kernels_in_superblock = remaining_channels / cores_used;
            if (core < remaining_channels % cores_used)
               kernels_in_superblock += 1;
         }

         unsigned first_channel;
         if (operation->depthwise)
            first_channel = cores_used - core - 1 + cores_used * full_superblock * superblock;
         else
            first_channel = channel_per_superblock[superblock];

         encode_superblock(subgraph, operation, encoder, kernels_in_superblock, first_channel);

         channel_per_superblock[superblock] += kernels_in_superblock;
      }

      unsigned actual_bits = encoder_flush(subgraph, operation, encoder);
      header->stream_size[core] = actual_bits;
   }
}

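/*
 * Write one bias value per output channel, each corrected for the input and
 * weight zero points, and return the pointer just past the last one.
 */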
static uint32_t *
fill_biases(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, uint32_t *map)
{
   uint8_t *input = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned input_channels = operation->input_channels;
   unsigned output_channels = operation->output_channels;

   if (operation->depthwise)
      input_channels = 1;
   else if (operation->addition)
      input_channels = 2 * output_channels;

   uint8_t (*weights_map)[input_channels][operation->weight_height][operation->weight_width] = (void *)input;
   for (unsigned oc = 0; oc < output_channels; oc++) {
      uint32_t corr = calculate_bias_correction(subgraph, operation, (uint8_t *)weights_map[oc]);

      *map = biases[oc] + corr;
      map++;
   }

   return map;
}

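/*
 * Create the coefficients buffer for a V8 NPU: a header carrying the packed
 * symbol map, followed by the compressed per-core weight streams and the
 * corrected biases. cache_size is set to the number of bytes that go into the
 * SRAM cache.
 */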
struct etna_bo *
etna_ml_create_coeffs_v8(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size)
{
   struct etna_bo *bo = create_bo(subgraph, operation);
   uint32_t *map = etna_bo_map(bo);
   struct etna_nn_header_v8 *header = (struct etna_nn_header_v8 *)map;
   struct encoder encoder;
   uint8_t symbol_map[8];

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   calculate_symbol_map(subgraph, operation, symbol_map);
   header->symbol_map = pack_symbol_map(symbol_map);
   header->version = 1;

   map += ALIGN(sizeof(*header), 64) / 4;

   encoder_init(&encoder, symbol_map, map);

   fill_weights(subgraph, operation, &encoder, header);
   map = fill_biases(subgraph, operation, encoder.dest);

   /* Size of the data that will go into the SRAM cache, header included */
   *cache_size = (uint8_t*)map - (uint8_t*)etna_bo_map(bo);

   etna_bo_cpu_fini(bo);

   return bo;
}
838