/*
 * Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
 * SPDX-License-Identifier: MIT
 */

#include "etnaviv_context.h"
#include "etnaviv_debug.h"
#include "etnaviv_ml_nn.h"
#include "etnaviv_screen.h"

static void *
map_resource(struct pipe_resource *resource)
{
   return etna_bo_map(etna_resource(resource)->bo);
}

#define MAX_TILE_WIDTH 64

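/* Compute how many superblocks each NN core splits its kernels into. The
 * number of kernels a core can process per pass is limited by how many
 * accumulators fit in the accumulation buffer for the given tile height and
 * interleave mode.
 */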
static unsigned
calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode)
{
   unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
   unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
   unsigned foo = (nn_accum_buffer_depth * interleave_mode) / tile_y;

   if (operation->weight_width == 1)
      foo = MIN2(foo, nn_accum_buffer_depth / 3);

   foo = MIN2(foo, kernels_per_core);
   foo = MIN2(foo, 127);

   kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * foo);
   unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count);
   unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels);

   return superblocks;
}

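/* Choose the interleave mode (1, 2, 4 or 8) for a tile: wider tiles and
 * taller kernels force smaller modes.
 */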
static unsigned
calc_interleave_mode(unsigned tile_width, unsigned weight_height)
{
   unsigned mode = 8;

   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2)
      return 1;

   if (tile_width > MAX_TILE_WIDTH / 2)
      mode = 1;
   else if (tile_width > MAX_TILE_WIDTH / 4)
      mode = 2;
   else if (tile_width > MAX_TILE_WIDTH / 8)
      mode = 4;

   if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4)
      return MIN2(mode, 4);

   return MIN2(mode, 2);
}

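/* Compute the tile width and height used to walk the output feature map and
 * return the number of superblocks per core. Tile height is limited by the
 * NPU's input and accumulation buffer depths.
 */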
unsigned
etna_ml_calculate_tiling_v7(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
{
   unsigned nn_input_buffer_depth = ctx->screen->info->npu.nn_input_buffer_depth;
   unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   unsigned output_width = operation->output_width;
   unsigned output_height = operation->output_height;
   unsigned output_channels = operation->output_channels;
   unsigned tile_width;
   unsigned tile_height;
   unsigned superblocks;
   unsigned interleave_mode;

   if (operation->addition)
      etna_ml_calc_addition_sizes(&input_width, &input_height, &input_channels,
                                  &output_width, &output_height, &output_channels);

   if (operation->pooling_first_pixel) {
      output_width *= 2;
      output_height *= 2;
   }

   tile_width = MIN2(output_width, 64);
   interleave_mode = calc_interleave_mode(tile_width, operation->weight_height);

   tile_height = nn_input_buffer_depth * interleave_mode - operation->weight_height + 1;
   tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth);
   tile_height = MIN2(tile_height, output_height);

   if (operation->stride > 1 && tile_height % 2 > 0)
      tile_height -= 1;

   tile_height = MAX2(tile_height, 1);
   superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode);

   if (tile_width_out)
      *tile_width_out = tile_width;

   if (tile_height_out)
      *tile_height_out = tile_height;

   return superblocks;
}

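/* Accumulate the term (weight - weight_zero_point) * input_zero_point over a
 * kernel, so callers can fold it into the bias of the quantized convolution.
 */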
static uint32_t
calculate_bias_correction(uint8_t *weights, const struct etna_operation *operation)
{
   int32_t correction = 0;

   for (unsigned i = 0; i < operation->weight_width * operation->weight_height * operation->input_channels; i++) {
      correction += (weights[i] - operation->weight_zero_point) * operation->input_zero_point;
   }

   return correction;
}

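/* Append "size" bits of "value" to the output stream. Bits are staged in a
 * 64-bit buffer and flushed to *dest one 32-bit word at a time; when do_write
 * is false only the write pointer is advanced, so the stream can be measured
 * without writing it.
 */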
static void
append_bits(uint32_t value, size_t size, unsigned *bits_in_buffer, uint64_t *buffer, uint32_t **dest, bool do_write)
{
   *buffer |= (uint64_t)value << *bits_in_buffer;
   *bits_in_buffer += size;
   if (*bits_in_buffer >= 32) {
      if (do_write)
         **dest = *buffer & 0xffffffff;
      *dest += 1;
      *buffer >>= 32;
      *bits_in_buffer -= 32;
   }
}

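/* State for the zero-run-length encoded weight stream: runs of the weight
 * zero point are collapsed into a zrl_bits-wide run count followed by the
 * next 8-bit value.
 */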
struct wb_stream {
   unsigned zero_point;
   unsigned zrl_bits;
   unsigned *bits_in_buffer;
   uint64_t *buffer;
   uint32_t **map;
   bool do_write;

   unsigned accum_zeroes;
};

static void
wb_stream_flush_zeroes(struct wb_stream *wb_stream)
{
   if (wb_stream->accum_zeroes == 0)
      return;

   append_bits(wb_stream->accum_zeroes - 1, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
   wb_stream->accum_zeroes = 0;
   append_bits(wb_stream->zero_point, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
}

static void
wb_stream_write(struct wb_stream *wb_stream, unsigned value)
{
   unsigned max_zeroes = (1 << wb_stream->zrl_bits) - 1;

   if (wb_stream->zrl_bits == 0) {
      append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
      return;
   }

   if (wb_stream->accum_zeroes == max_zeroes) {
      append_bits(max_zeroes, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
      wb_stream->accum_zeroes = 0;
      append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
      return;
   }

   if (value == wb_stream->zero_point) {
      wb_stream->accum_zeroes++;
      return;
   }

   append_bits(wb_stream->accum_zeroes, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
   wb_stream->accum_zeroes = 0;
   append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
}

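/* Write the compressed weights and biases for one core in the layout used
 * for pointwise convolutions with more than 8 output channels: kernels are
 * grouped into superblocks and their input channels are emitted in blocks of
 * up to 6. Returns the size of the stream in bytes; with map == NULL the
 * stream is only measured.
 */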
static unsigned
write_core_6(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
{
   struct pipe_context *pctx = subgraph->base.context;
   unsigned nn_core_count = etna_ml_get_core_info(etna_context(pctx))->nn_core_count;
   unsigned input_channels = operation->addition ? 1 : operation->input_channels;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
   uint8_t *input = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned out_values_per_channel = operation->output_width * operation->output_height;
   unsigned stride = MIN2(input_channels, 6);
   unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL);
   uint8_t *weights_maps[DIV_ROUND_UP(kernels_per_core, superblocks)];
   uint32_t *initial_ptr = map;
   bool do_write = initial_ptr != NULL;
   uint64_t buffer = 0;
   unsigned bits_in_buffer = 0;
   struct wb_stream wb_stream = {
      .zero_point = operation->weight_zero_point,
      .zrl_bits = zrl_bits,
      .bits_in_buffer = &bits_in_buffer,
      .buffer = &buffer,
      .map = &map,
      .do_write = do_write,
   };

   ML_DBG("%s core %d zrl_bits %d\n", __func__, core, zrl_bits);

   append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
   append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);

   for (unsigned superblock = 0; superblock < superblocks; superblock++) {

      unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
      if (superblock == superblocks - 1)
         kernels_in_superblock = kernels_per_core - kernels_in_superblock * (superblocks - 1);

      for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
         unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;
         weights_maps[kernel] = input + out_channel * operation->weight_width * operation->weight_height * input_channels;
      }

      for (unsigned block = 0; block < DIV_ROUND_UP(input_channels, stride); block++) {
         for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
            unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;

            if (block == 0) {
               wb_stream_write(&wb_stream, weights_maps[kernel][0]);

               uint32_t corr = calculate_bias_correction(weights_maps[kernel], operation);
               wb_stream_flush_zeroes(&wb_stream);
               append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);

               for (int i = 1; i < stride; i++) {
                  wb_stream_write(&wb_stream, weights_maps[kernel][i]);
               }
            } else {
               for (int i = 0; i < stride; i++) {
                  if (i + block * stride < input_channels)
                     wb_stream_write(&wb_stream, weights_maps[kernel][i + block * stride]);
               }
            }
            if (block == DIV_ROUND_UP(input_channels, stride) - 1) {
               wb_stream_flush_zeroes(&wb_stream);
               append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
            }
         }
      }
   }

   wb_stream_flush_zeroes(&wb_stream);

   if (bits_in_buffer > 0)
      append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);

   return (uint8_t *)map - (uint8_t *)initial_ptr - 1;
}

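/* Write the compressed weights and biases for one core when the operation
 * has more than one input channel: the weights of all kernels in a
 * superblock are interleaved channel by channel. Returns the size of the
 * stream in bytes; with map == NULL the stream is only measured.
 */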
static unsigned
write_core_interleaved(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
{
   struct pipe_context *pctx = subgraph->base.context;
   unsigned nn_core_count = etna_ml_get_core_info(etna_context(pctx))->nn_core_count;
   unsigned input_channels = operation->addition ? 1 : operation->input_channels;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
   uint8_t *input = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned out_values_per_channel = operation->output_width * operation->output_height;
   unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL);
   uint8_t (*weights_map)[input_channels][operation->weight_width][operation->weight_height] = (void *)input;
   uint32_t *initial_ptr = map;
   bool do_write = initial_ptr != NULL;
   uint64_t buffer = 0;
   unsigned bits_in_buffer = 0;
   struct wb_stream wb_stream = {
      .zero_point = operation->weight_zero_point,
      .zrl_bits = zrl_bits,
      .bits_in_buffer = &bits_in_buffer,
      .buffer = &buffer,
      .map = &map,
      .do_write = do_write,
   };

   ML_DBG("%s core %d zrl_bits %d map %p\n", __func__, core, zrl_bits, map);

   append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
   append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);

   for (unsigned superblock = 0; superblock < superblocks; superblock++) {

      unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
      if (superblock == superblocks - 1)
         kernels_in_superblock = kernels_per_core - kernels_in_superblock * (superblocks - 1);

      for (unsigned z = 0; z < input_channels; z++) {
         for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
            unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;

            for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) {
               unsigned stride = operation->weight_height;
               if (operation->weight_height > 3)
                  stride = 3;
               for (unsigned x = block * 2; x < (block + 1) * 2; x++) {
                  if (x >= operation->weight_width)
                     break;
                  for (unsigned y = 0; y < stride; y++) {
                     wb_stream_write(&wb_stream, weights_map[out_channel][z][x][y]);
                     if (x == 0 && y == 0 && z == 0) {
                        uint32_t corr = calculate_bias_correction((uint8_t *)weights_map[out_channel], operation);
                        wb_stream_flush_zeroes(&wb_stream);
                        append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);
                     }
                  }
               }
               if (operation->weight_height > 3) {
                  for (unsigned x = block * 2; x < (block + 1) * 2; x++) {
                     if (x >= operation->weight_width)
                        break;
                     for (unsigned y = stride; y < operation->weight_width; y++) {
                        wb_stream_write(&wb_stream, weights_map[out_channel][z][x][y]);
                     }
                  }
               }
            }

            if (z == input_channels - 1) {
               wb_stream_flush_zeroes(&wb_stream);
               append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
            }
         }
         if (superblock == superblocks - 1)
            wb_stream_flush_zeroes(&wb_stream);
      }
   }

   wb_stream_flush_zeroes(&wb_stream);

   if (bits_in_buffer > 0)
      append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);

   return (uint8_t *)map - (uint8_t *)initial_ptr;
}

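/* Write the compressed weights and biases for one core when the operation
 * has a single input channel, such as depthwise convolutions: kernels are
 * emitted one after another. Returns the size of the stream in bytes; with
 * map == NULL the stream is only measured.
 */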
static unsigned
write_core_sequential(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
{
   struct pipe_context *pctx = subgraph->base.context;
   unsigned nn_core_count = etna_ml_get_core_info(etna_context(pctx))->nn_core_count;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
   uint8_t *input = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned out_values_per_channel = operation->output_width * operation->output_height;
   unsigned superblocks = etna_ml_calculate_tiling_v7(etna_context(pctx), operation, NULL, NULL);
   uint32_t *initial_ptr = map;
   bool do_write = initial_ptr != NULL;
   uint64_t buffer = 0;
   unsigned bits_in_buffer = 0;
   struct wb_stream wb_stream = {
      .zero_point = operation->weight_zero_point,
      .zrl_bits = zrl_bits,
      .bits_in_buffer = &bits_in_buffer,
      .buffer = &buffer,
      .map = &map,
      .do_write = do_write,
   };

   ML_DBG("%s core %d zrl_bits %d superblocks %d\n", __func__, core, zrl_bits, superblocks);

   append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
   append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);

   for (unsigned superblock = 0; superblock < superblocks; superblock++) {

      unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
      if (superblock == superblocks - 1)
         kernels_in_superblock = kernels_per_core - kernels_in_superblock * (superblocks - 1);

      for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
         unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;

         uint8_t (*weights_map)[operation->weight_height] = (void *)input + out_channel * operation->weight_width * operation->weight_height;

         for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) {
            unsigned stride = operation->weight_height;
            if ((operation->depthwise || operation->input_width > 64) &&
                operation->weight_height > 3)
               stride = 3;
            for (unsigned x = block * 2; x < (block + 1) * 2; x++) {
               if (x >= operation->weight_width)
                  break;
               for (unsigned y = 0; y < stride; y++) {
                  wb_stream_write(&wb_stream, weights_map[x][y]);
                  if (x == 0 && y == 0) {
                     uint32_t corr = calculate_bias_correction((uint8_t *)weights_map, operation);
                     wb_stream_flush_zeroes(&wb_stream);
                     append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);
                  }
               }
            }
            if ((operation->depthwise || operation->input_width > 64) &&
                operation->weight_height > 3) {
               for (unsigned x = block * 2; x < (block + 1) * 2; x++) {
                  if (x >= operation->weight_width)
                     break;
                  for (unsigned y = stride; y < operation->weight_width; y++) {
                     wb_stream_write(&wb_stream, weights_map[x][y]);
                  }
               }
            }
         }
         wb_stream_flush_zeroes(&wb_stream);
         if (operation->addition)
            append_bits(operation->addition_offset, 32, &bits_in_buffer, &buffer, &map, do_write);
         else
            append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
      }
   }

   wb_stream_flush_zeroes(&wb_stream);

   if (bits_in_buffer > 0)
      append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);

   return (uint8_t *)map - (uint8_t *)initial_ptr - 1;
}

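/* Worst-case size of the buffer object that will hold the compressed
 * weights: the header of per-core stream sizes plus an uncompressed stream
 * for every kernel, with each core's stream aligned to 64 bytes.
 */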
static unsigned
calculate_weight_bo_size(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   unsigned header_size = ALIGN(nn_core_count * 4, 64);
   unsigned input_channels = operation->addition ? 1 : operation->input_channels;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
   unsigned weights_size;
   unsigned core_size;
   unsigned core_size_aligned;
   unsigned compressed_size_aligned;

   weights_size = operation->weight_width * operation->weight_height * input_channels;
   core_size = 1 + 2 + (weights_size + 4 + 4) * kernels_per_core;
   core_size_aligned = ALIGN(core_size, 64);
   compressed_size_aligned = header_size + core_size_aligned * cores_used;

   return compressed_size_aligned;
}

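/* Find the zero-run-length bit width that gives the smallest total stream,
 * by dry-running the per-core writers with a NULL map for each candidate
 * width.
 */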
static unsigned
calculate_zrl_bits(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   unsigned max_zrl_bits = etna_ml_get_core_info(ctx)->nn_zrl_bits;
   unsigned header_size = ALIGN(nn_core_count * 4, 64);
   unsigned input_channels = operation->addition ? 1 : operation->input_channels;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned best_compressed_size;
   unsigned best_zrl_bits;

   /* These are very unlikely to have enough zeroes for compression to be useful. */
   if (operation->addition ||
       operation->pointwise) {
      return 0;
   }

   /* This calculation can be really slow. Start from max_zrl_bits, as big
    * buffers will benefit the most from high zero compression.
    */
   best_compressed_size = UINT_MAX;
   best_zrl_bits = 0;
   for (int zrl_bits = max_zrl_bits; zrl_bits >= 0; zrl_bits--) {

      unsigned compressed_size = header_size;
      for (unsigned core = 0; core < cores_used; core++) {

         unsigned actual_size;
         if (operation->pointwise && output_channels > 8)
            actual_size = write_core_6(subgraph, NULL, core, operation, zrl_bits);
         else if (input_channels > 1)
            actual_size = write_core_interleaved(subgraph, NULL, core, operation, zrl_bits);
         else
            actual_size = write_core_sequential(subgraph, NULL, core, operation, zrl_bits);

         compressed_size += actual_size;
      }

      /* Stop once using fewer bits no longer compresses further. */
      if (compressed_size <= best_compressed_size) {
         best_compressed_size = compressed_size;
         best_zrl_bits = zrl_bits;
      } else
         break;
   }

   return best_zrl_bits;
}

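/* Create the buffer object with the compressed weights and biases for a NN
 * job, as laid out for v7 NPUs: a 64-byte-aligned header with the compressed
 * size of each core's stream, followed by the per-core streams themselves.
 * *cache_size is set to the largest aligned per-core stream size multiplied
 * by the number of cores used.
 */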
struct etna_bo *
etna_ml_create_coeffs_v7(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size)
{
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
   unsigned header_size = ALIGN(nn_core_count * 4, 64);
   unsigned input_channels = operation->addition ? 1 : operation->input_channels;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
   unsigned zrl_bits;
   unsigned max_core_size = 0;
   unsigned bo_size;

   bo_size = calculate_weight_bo_size(subgraph, operation);
   zrl_bits = calculate_zrl_bits(subgraph, operation);

   struct etna_bo *compressed = etna_ml_create_bo(context, bo_size);

   etna_bo_cpu_prep(compressed, DRM_ETNA_PREP_WRITE);

   uint32_t *map = etna_bo_map(compressed);

   uint32_t *header = map;
   map += header_size / 4;

   for (unsigned core = 0; core < cores_used; core++) {

      unsigned actual_size;
      if (operation->pointwise && output_channels > 8)
         actual_size = write_core_6(subgraph, map, core, operation, zrl_bits);
      else if (input_channels > 1)
         actual_size = write_core_interleaved(subgraph, map, core, operation, zrl_bits);
      else
         actual_size = write_core_sequential(subgraph, map, core, operation, zrl_bits);

      actual_size = ALIGN(actual_size, 64);
      max_core_size = MAX2(actual_size, max_core_size);

      header[core] = actual_size;

      map += actual_size / 4;
   }

   etna_bo_cpu_fini(compressed);

   *cache_size = max_core_size * cores_used;

   return compressed;
}