/*
 * Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
 * SPDX-License-Identifier: MIT
 */

#include "etnaviv_context.h"
#include "etnaviv_debug.h"
#include "etnaviv_ml_nn.h"
#include "etnaviv_screen.h"

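/* Map a pipe_resource's backing BO into CPU-visible memory. */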
static void *
map_resource(struct pipe_resource *resource)
{
   return etna_bo_map(etna_resource(resource)->bo);
}

#define MAX_TILE_WIDTH 64

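/*
 * Compute the number of superblocks that the output channels are split into
 * for this operation. The count derives from how many kernels each NN core
 * processes, bounded by the accumulation buffer depth, the interleave mode
 * and what appears to be a hardware limit of 127.
 */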
static unsigned
calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode)
{
   unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
   unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
   unsigned foo = (nn_accum_buffer_depth * interleave_mode) / tile_y;

   if (operation->weight_width == 1)
      foo = MIN2(foo, nn_accum_buffer_depth / 3);

   foo = MIN2(foo, kernels_per_core);
   foo = MIN2(foo, 127);

   kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * foo);
   unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count);
   unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels);

   return superblocks;
}

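/*
 * Pick the interleave mode for a given tile width and kernel height: the
 * wider the tile and the taller the kernel, the fewer rows can be
 * interleaved.
 */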
static unsigned
calc_interleave_mode(unsigned tile_width, unsigned weight_height)
{
   unsigned mode = 8;

   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2)
      return 1;

   if (tile_width > MAX_TILE_WIDTH / 2)
      mode = 1;
   else if (tile_width > MAX_TILE_WIDTH / 4)
      mode = 2;
   else if (tile_width > MAX_TILE_WIDTH / 8)
      mode = 4;

   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4)
      return MIN2(mode, 4);

   return MIN2(mode, 2);
}

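/*
 * Calculate the tile size with which the NPU will walk the output feature
 * map, and return the resulting number of superblocks. tile_width_out and
 * tile_height_out may be NULL if the caller only needs the superblock count.
 */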
unsigned
etna_ml_calculate_tiling_v7(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
{
   unsigned nn_input_buffer_depth = ctx->screen->info->npu.nn_input_buffer_depth;
   unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   unsigned output_width = operation->output_width;
   unsigned output_height = operation->output_height;
   unsigned output_channels = operation->output_channels;
   unsigned tile_width;
   unsigned tile_height;
   unsigned superblocks;
   unsigned interleave_mode;

   if (operation->addition)
      etna_ml_calc_addition_sizes(&input_width, &input_height, &input_channels,
                                  &output_width, &output_height, &output_channels);

   if (operation->pooling_first_pixel) {
      output_width *= 2;
      output_height *= 2;
   }

   tile_width = MIN2(output_width, 64);
   interleave_mode = calc_interleave_mode(tile_width, operation->weight_height);

   tile_height = nn_input_buffer_depth * interleave_mode - operation->weight_height + 1;
   tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth);
   tile_height = MIN2(tile_height, output_height);

   if (operation->stride > 1 && tile_height % 2 > 0)
      tile_height -= 1;

   tile_height = MAX2(tile_height, 1);
   superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode);

   if (tile_width_out)
      *tile_width_out = tile_width;

   if (tile_height_out)
      *tile_height_out = tile_height;

   return superblocks;
}

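/*
 * Weights and inputs are stored as zero-point-offset uint8 values, so the
 * contribution of the zero points has to be folded into the bias: accumulate
 * (weight - weight_zero_point) * input_zero_point over the whole kernel.
 */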
static uint32_t
calculate_bias_correction(uint8_t *weights, const struct etna_operation *operation)
{
   int32_t correction = 0;

   for (unsigned i = 0; i < operation->weight_width * operation->weight_height * operation->input_channels; i++) {
      correction += (weights[i] - operation->weight_zero_point) * operation->input_zero_point;
   }

   return correction;
}

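/*
 * Append "size" bits of "value" to the bit buffer, flushing full 32-bit
 * words to *dest. When do_write is false only the destination pointer is
 * advanced, which lets callers do a sizing pass without a backing buffer.
 */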
static void
append_bits(uint32_t value, size_t size, unsigned *bits_in_buffer, uint64_t *buffer, uint32_t **dest, bool do_write)
{
   *buffer |= (uint64_t)value << *bits_in_buffer;
   *bits_in_buffer += size;
   if (*bits_in_buffer >= 32) {
      if (do_write)
         **dest = *buffer & 0xffffffff;
      *dest += 1;
      *buffer >>= 32;
      *bits_in_buffer -= 32;
   }
}

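/*
 * State for writing a zero-run-length-encoded weight stream: runs of
 * weights equal to the zero point are accumulated and emitted as a
 * zrl_bits-wide run counter instead of as individual bytes.
 */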
struct wb_stream {
   unsigned zero_point;
   unsigned zrl_bits;
   unsigned *bits_in_buffer;
   uint64_t *buffer;
   uint32_t **map;
   bool do_write;

   unsigned accum_zeroes;
};

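/* Emit any pending run of zeroes as a (run length - 1, zero point) pair. */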
static void
wb_stream_flush_zeroes(struct wb_stream *wb_stream)
{
   if (wb_stream->accum_zeroes == 0)
      return;

   append_bits(wb_stream->accum_zeroes - 1, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
   wb_stream->accum_zeroes = 0;
   append_bits(wb_stream->zero_point, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
}

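/*
 * Write one weight byte to the stream. Zero-point values are coalesced into
 * runs; a run is flushed when it reaches the maximum length encodable in
 * zrl_bits or when a non-zero-point value arrives.
 */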
static void
wb_stream_write(struct wb_stream *wb_stream, unsigned value)
{
   unsigned max_zeroes = (1 << wb_stream->zrl_bits) - 1;

   if (wb_stream->zrl_bits == 0) {
      append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
      return;
   }

   if (wb_stream->accum_zeroes == max_zeroes) {
      append_bits(max_zeroes, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
      wb_stream->accum_zeroes = 0;
      append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
      return;
   }

   if (value == wb_stream->zero_point) {
      wb_stream->accum_zeroes++;
      return;
   }

   append_bits(wb_stream->accum_zeroes, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
   wb_stream->accum_zeroes = 0;
   append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
}

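/*
 * Pack the weights and biases for one NN core in the layout used for
 * pointwise (1x1) convolutions with more than 8 output channels: weights are
 * streamed in groups of up to 6 input channels, with the corrected bias
 * inserted after the first weight of each kernel and the destination offset
 * after the last group. Returns the byte size of the core's stream; with
 * map == NULL nothing is written and only the size is computed.
 */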
static unsigned
write_core_6(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
{
   struct pipe_context *pctx = subgraph->base.context;
   unsigned nn_core_count = etna_ml_get_core_info(etna_context(pctx))->nn_core_count;
   unsigned input_channels = operation->addition ? 1 : operation->input_channels;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
   uint8_t *input = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned out_values_per_channel = operation->output_width * operation->output_height;
   unsigned stride = MIN2(input_channels, 6);
   unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL);
   uint8_t *weights_maps[DIV_ROUND_UP(kernels_per_core, superblocks)];
   uint32_t *initial_ptr = map;
   bool do_write = initial_ptr != NULL;
   uint64_t buffer = 0;
   unsigned bits_in_buffer = 0;
   struct wb_stream wb_stream = {
      .zero_point = operation->weight_zero_point,
      .zrl_bits = zrl_bits,
      .bits_in_buffer = &bits_in_buffer,
      .buffer = &buffer,
      .map = &map,
      .do_write = do_write,
   };

   ML_DBG("%s core %d zrl_bits %d\n", __func__, core, zrl_bits);

   append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
   append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);

   for (unsigned superblock = 0; superblock < superblocks; superblock++) {
      unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
      if (superblock == superblocks - 1)
         kernels_in_superblock = kernels_per_core - kernels_in_superblock * (superblocks - 1);

      for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
         unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;
         weights_maps[kernel] = input + out_channel * operation->weight_width * operation->weight_height * input_channels;
      }

      for (unsigned block = 0; block < DIV_ROUND_UP(input_channels, stride); block++) {
         for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
            unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;

            if (block == 0) {
               wb_stream_write(&wb_stream, weights_maps[kernel][0]);

               uint32_t corr = calculate_bias_correction(weights_maps[kernel], operation);
               wb_stream_flush_zeroes(&wb_stream);
               append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);

               for (int i = 1; i < stride; i++) {
                  wb_stream_write(&wb_stream, weights_maps[kernel][i]);
               }
            } else {
               for (int i = 0; i < stride; i++) {
                  if (i + block * stride < input_channels)
                     wb_stream_write(&wb_stream, weights_maps[kernel][i + block * stride]);
               }
            }
            if (block == DIV_ROUND_UP(input_channels, stride) - 1) {
               wb_stream_flush_zeroes(&wb_stream);
               append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
            }
         }
      }
   }

   wb_stream_flush_zeroes(&wb_stream);

   if (bits_in_buffer > 0)
      append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);

   return (uint8_t *)map - (uint8_t *)initial_ptr - 1;
}

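/*
 * Pack the weights and biases for one NN core with the interleaved layout
 * used for multi-channel (non-pointwise) convolutions: for each input
 * channel, each kernel's weights are streamed in column pairs, with the
 * corrected bias inserted after the very first weight and the destination
 * offset after the last input channel. Returns the byte size of the core's
 * stream; with map == NULL nothing is written and only the size is computed.
 */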
static unsigned
write_core_interleaved(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
{
   struct pipe_context *pctx = subgraph->base.context;
   unsigned nn_core_count = etna_ml_get_core_info(etna_context(pctx))->nn_core_count;
   unsigned input_channels = operation->addition ? 1 : operation->input_channels;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
   uint8_t *input = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned out_values_per_channel = operation->output_width * operation->output_height;
   unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL);
   uint8_t (*weights_map)[input_channels][operation->weight_width][operation->weight_height] = (void *)input;
   uint32_t *initial_ptr = map;
   bool do_write = initial_ptr != NULL;
   uint64_t buffer = 0;
   unsigned bits_in_buffer = 0;
   struct wb_stream wb_stream = {
      .zero_point = operation->weight_zero_point,
      .zrl_bits = zrl_bits,
      .bits_in_buffer = &bits_in_buffer,
      .buffer = &buffer,
      .map = &map,
      .do_write = do_write,
   };

   ML_DBG("%s core %d zrl_bits %d map %p\n", __func__, core, zrl_bits, map);

   append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
   append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);

   for (unsigned superblock = 0; superblock < superblocks; superblock++) {
      unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
      if (superblock == superblocks - 1)
         kernels_in_superblock = kernels_per_core - kernels_in_superblock * (superblocks - 1);

      for (unsigned z = 0; z < input_channels; z++) {
         for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
            unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;

            for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) {
               unsigned stride = operation->weight_height;
               if (operation->weight_height > 3)
                  stride = 3;
               for (unsigned x = block * 2; x < (block + 1) * 2; x++) {
                  if (x >= operation->weight_width)
                     break;
                  for (unsigned y = 0; y < stride; y++) {
                     wb_stream_write(&wb_stream, weights_map[out_channel][z][x][y]);
                     if (x == 0 && y == 0 && z == 0) {
                        uint32_t corr = calculate_bias_correction((uint8_t *)weights_map[out_channel], operation);
                        wb_stream_flush_zeroes(&wb_stream);
                        append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);
                     }
                  }
               }
               if (operation->weight_height > 3) {
                  for (unsigned x = block * 2; x < (block + 1) * 2; x++) {
                     if (x >= operation->weight_width)
                        break;
                     for (unsigned y = stride; y < operation->weight_width; y++) {
                        wb_stream_write(&wb_stream, weights_map[out_channel][z][x][y]);
                     }
                  }
               }
            }

            if (z == input_channels - 1) {
               wb_stream_flush_zeroes(&wb_stream);
               append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
            }
         }
         if (superblock == superblocks - 1)
            wb_stream_flush_zeroes(&wb_stream);
      }
   }

   wb_stream_flush_zeroes(&wb_stream);

   if (bits_in_buffer > 0)
      append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);

   return (uint8_t *)map - (uint8_t *)initial_ptr;
}

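/*
 * Pack the weights and biases for one NN core with the sequential layout
 * used when a kernel reads a single input channel, as with depthwise
 * convolutions and additions: each kernel is streamed whole, followed by
 * either the destination offset of its output channel or, for additions, the
 * addition offset. Returns the byte size of the core's stream; with
 * map == NULL nothing is written and only the size is computed.
 */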
static unsigned
write_core_sequential(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
{
   struct pipe_context *pctx = subgraph->base.context;
   unsigned nn_core_count = etna_ml_get_core_info(etna_context(pctx))->nn_core_count;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
   uint8_t *input = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned out_values_per_channel = operation->output_width * operation->output_height;
   unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL);
   uint32_t *initial_ptr = map;
   bool do_write = initial_ptr != NULL;
   uint64_t buffer = 0;
   unsigned bits_in_buffer = 0;
   struct wb_stream wb_stream = {
      .zero_point = operation->weight_zero_point,
      .zrl_bits = zrl_bits,
      .bits_in_buffer = &bits_in_buffer,
      .buffer = &buffer,
      .map = &map,
      .do_write = do_write,
   };

   ML_DBG("%s core %d zrl_bits %d superblocks %d\n", __func__, core, zrl_bits, superblocks);

   append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
   append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);

   for (unsigned superblock = 0; superblock < superblocks; superblock++) {
      unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
      if (superblock == superblocks - 1)
         kernels_in_superblock = kernels_per_core - kernels_in_superblock * (superblocks - 1);

      for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
         unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;

         uint8_t (*weights_map)[operation->weight_height] = (void *)input + out_channel * operation->weight_width * operation->weight_height;

         for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) {
            unsigned stride = operation->weight_height;
            if ((operation->depthwise || operation->input_width > 64) &&
                operation->weight_height > 3)
               stride = 3;
            for (unsigned x = block * 2; x < (block + 1) * 2; x++) {
               if (x >= operation->weight_width)
                  break;
               for (unsigned y = 0; y < stride; y++) {
                  wb_stream_write(&wb_stream, weights_map[x][y]);
                  if (x == 0 && y == 0) {
                     uint32_t corr = calculate_bias_correction((uint8_t *)weights_map, operation);
                     wb_stream_flush_zeroes(&wb_stream);
                     append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);
                  }
               }
            }
            if ((operation->depthwise || operation->input_width > 64) &&
                operation->weight_height > 3) {
               for (unsigned x = block * 2; x < (block + 1) * 2; x++) {
                  if (x >= operation->weight_width)
                     break;
                  for (unsigned y = stride; y < operation->weight_width; y++) {
                     wb_stream_write(&wb_stream, weights_map[x][y]);
                  }
               }
            }
         }
         wb_stream_flush_zeroes(&wb_stream);
         if (operation->addition)
            append_bits(operation->addition_offset, 32, &bits_in_buffer, &buffer, &map, do_write);
         else
            append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
      }
   }

   wb_stream_flush_zeroes(&wb_stream);

   if (bits_in_buffer > 0)
      append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);

   return (uint8_t *)map - (uint8_t *)initial_ptr - 1;
}

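/*
 * Worst-case size of the BO holding the compressed weights: the per-core
 * header plus, for every kernel, the raw weights, the 32-bit corrected bias
 * and the 32-bit destination offset, assuming no zero-run compression.
 */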
static unsigned
calculate_weight_bo_size(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   unsigned header_size = ALIGN(nn_core_count * 4, 64);
   unsigned input_channels = operation->addition ? 1 : operation->input_channels;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
   unsigned weights_size;
   unsigned core_size;
   unsigned core_size_aligned;
   unsigned compressed_size_aligned;

   weights_size = operation->weight_width * operation->weight_height * input_channels;
   core_size = 1 + 2 + (weights_size + 4 + 4) * kernels_per_core;
   core_size_aligned = ALIGN(core_size, 64);
   compressed_size_aligned = header_size + core_size_aligned * cores_used;

   return compressed_size_aligned;
}

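/*
 * Find the zero-run-length counter width that gives the smallest compressed
 * weights, by doing a dry-run encode (map == NULL) of every core for each
 * candidate width.
 */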
static unsigned
calculate_zrl_bits(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   unsigned max_zrl_bits = etna_ml_get_core_info(ctx)->nn_zrl_bits;
   unsigned header_size = ALIGN(nn_core_count * 4, 64);
   unsigned input_channels = operation->addition ? 1 : operation->input_channels;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned best_compressed_size;
   unsigned best_zrl_bits;

   /* These are very unlikely to have enough zeroes for compression to be useful. */
   if (operation->addition || operation->pointwise)
      return 0;

   /* This calculation can be really slow. Start from max_zrl_bits, as big
    * buffers will benefit the most from high zero compression.
    */
   best_compressed_size = UINT_MAX;
   best_zrl_bits = 0;
   for (int zrl_bits = max_zrl_bits; zrl_bits >= 0; zrl_bits--) {
      unsigned compressed_size = header_size;

      for (unsigned core = 0; core < cores_used; core++) {
         unsigned actual_size;

         if (operation->pointwise && output_channels > 8)
            actual_size = write_core_6(subgraph, NULL, core, operation, zrl_bits);
         else if (input_channels > 1)
            actual_size = write_core_interleaved(subgraph, NULL, core, operation, zrl_bits);
         else
            actual_size = write_core_sequential(subgraph, NULL, core, operation, zrl_bits);

         compressed_size += actual_size;
      }

      /* Stop as soon as using fewer bits stops helping. */
      if (compressed_size <= best_compressed_size) {
         best_compressed_size = compressed_size;
         best_zrl_bits = zrl_bits;
      } else {
         break;
      }
   }

   return best_zrl_bits;
}

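/*
 * Create a BO with the compressed weights and biases for a v7 NPU: one
 * zero-run-length-encoded stream per NN core, preceded by a 64-byte-aligned
 * header holding each core's aligned stream size. Through cache_size,
 * returns the worst-case cache footprint of the streams (largest aligned
 * per-core size times the number of cores used).
 */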
struct etna_bo *
etna_ml_create_coeffs_v7(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   unsigned header_size = ALIGN(nn_core_count * 4, 64);
   unsigned input_channels = operation->addition ? 1 : operation->input_channels;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned zrl_bits;
   unsigned max_core_size = 0;
   unsigned bo_size;

   bo_size = calculate_weight_bo_size(subgraph, operation);
   zrl_bits = calculate_zrl_bits(subgraph, operation);

   struct etna_bo *compressed = etna_ml_create_bo(context, bo_size);

   etna_bo_cpu_prep(compressed, DRM_ETNA_PREP_WRITE);

   uint32_t *map = etna_bo_map(compressed);

   uint32_t *header = map;
   map += header_size / 4;

   for (unsigned core = 0; core < cores_used; core++) {
      unsigned actual_size;

      if (operation->pointwise && output_channels > 8)
         actual_size = write_core_6(subgraph, map, core, operation, zrl_bits);
      else if (input_channels > 1)
         actual_size = write_core_interleaved(subgraph, map, core, operation, zrl_bits);
      else
         actual_size = write_core_sequential(subgraph, map, core, operation, zrl_bits);

      actual_size = ALIGN(actual_size, 64);
      max_core_size = MAX2(actual_size, max_core_size);

      header[core] = actual_size;

      map += actual_size / 4;
   }

   etna_bo_cpu_fini(compressed);

   *cache_size = max_core_size * cores_used;

   return compressed;
}