/*
 * Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
 * SPDX-License-Identifier: MIT
 */

#include "pipe-loader/pipe_loader.h"
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "util/format/u_format.h"
#include "util/u_inlines.h"

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/builtin_ops.h"
#include "tensorflow/lite/core/c/builtin_op_data.h"

/* TODO: Move to TfLiteAsyncKernel for zero-copy of buffers */

enum teflon_debug_flags {
   TEFLON_DEBUG_VERBOSE = 1 << 1,
};

static const struct debug_named_value teflon_debug_flags[] = {
   { "verbose", TEFLON_DEBUG_VERBOSE, "Verbose logging." },
   DEBUG_NAMED_VALUE_END
};

DEBUG_GET_ONCE_FLAGS_OPTION(debug_teflon, "TEFLON_DEBUG", teflon_debug_flags, 0)

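/* Prints to the debug log when TEFLON_DEBUG=verbose is set in the environment. */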
static inline void
teflon_debug(const char *format, ...)
{
   if (unlikely(debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)) {
      va_list ap;
      va_start(ap, format);
      _debug_vprintf(format, ap);
      va_end(ap);
   }
}

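/* Per-delegate state: the TfLiteDelegate base plus the Gallium device and
 * context used to compile and run subgraphs.
 */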
struct teflon_delegate
{
   TfLiteDelegate base;
   struct pipe_loader_device *dev;
   struct pipe_context *context;
};

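/* Per-partition state: the compiled Gallium subgraph plus the indices of the
 * TfLite tensors used for input and output.
 */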
struct teflon_subgraph
{
   struct pipe_ml_subgraph *base;

   unsigned *input_tensors;
   unsigned input_count;

   unsigned *output_tensors;
   unsigned output_count;
};

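/* Allocates a pipe buffer sized to hold the tensor's elements and uploads the
 * tensor's data to it.
 */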
static struct pipe_resource *
create_resource(struct pipe_context *context, TfLiteTensor tensor)
{
   unsigned bytes;
   unsigned size = 1;

   for (int i = 0; i < tensor.dims->size; i++)
      size *= tensor.dims->data[i];

   switch (tensor.type) {
   case kTfLiteInt8:
   case kTfLiteUInt8:
      bytes = 1;
      break;
   case kTfLiteInt16:
   case kTfLiteUInt16:
   case kTfLiteFloat16:
      bytes = 2;
      break;
   case kTfLiteInt32:
   case kTfLiteUInt32:
   case kTfLiteFloat32:
      bytes = 4;
      break;
   case kTfLiteInt64:
   case kTfLiteUInt64:
   case kTfLiteFloat64:
   case kTfLiteComplex64:
      bytes = 8;
      break;
   default:
      unreachable("Unsupported TF type");
   }

   return pipe_buffer_create_with_data(context, 0, PIPE_USAGE_DEFAULT, size * bytes, tensor.data.data);
}

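/* Translates a delegated TfLite node into a struct pipe_ml_operation that the
 * Gallium driver can consume.
 */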
static void
fill_operation(struct teflon_delegate *delegate, TfLiteContext *tf_context, TfLiteNode *node, TfLiteRegistration *node_registration, struct pipe_ml_operation *operation, struct pipe_tensor *tensors)
{
   TfLiteConvParams *params = (TfLiteConvParams *)node->builtin_data;

   operation->input_tensor = &tensors[node->inputs->data[0]];
   operation->output_tensor = &tensors[node->outputs->data[0]];

   switch (node_registration->builtin_code) {
   case kTfLiteBuiltinConv2d:
   case kTfLiteBuiltinDepthwiseConv2d:
      operation->type = PIPE_ML_OPERATION_TYPE_CONVOLUTION;
      operation->conv.weight_tensor = &tensors[node->inputs->data[1]];
      operation->conv.bias_tensor = &tensors[node->inputs->data[2]];
      operation->conv.stride_x = params->stride_width;
      operation->conv.stride_y = params->stride_height;
      operation->conv.padding_same = params->padding == kTfLitePaddingSame;
      operation->conv.depthwise = node_registration->builtin_code == kTfLiteBuiltinDepthwiseConv2d;
      operation->conv.pointwise = operation->conv.weight_tensor->dims[1] == 1 &&
                                  operation->conv.weight_tensor->dims[2] == 1;
      break;
   case kTfLiteBuiltinAveragePool2d:
      operation->type = PIPE_ML_OPERATION_TYPE_POOLING;
      break;
   case kTfLiteBuiltinAdd:
      operation->type = PIPE_ML_OPERATION_TYPE_ADD;
      operation->add.input_tensor = &tensors[node->inputs->data[1]];
      break;
   default:
      unreachable("Unsupported ML operation type");
   }
}

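/* Translates the TfLite tensor at the given index into a struct pipe_tensor,
 * creating a backing resource if the tensor carries data.
 */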
static void
fill_tensor(struct teflon_delegate *delegate, TfLiteContext *tf_context, struct pipe_tensor *tensor, unsigned index)
{
   struct pipe_context *context = delegate->context;
   TfLiteTensor tf_tensor = tf_context->tensors[index];
   const TfLiteAffineQuantization *quant = (const TfLiteAffineQuantization *)tf_tensor.quantization.params;

   if (tf_tensor.type == kTfLiteNoType)
      return; /* Placeholder tensor */

   if (tf_tensor.data.data)
      tensor->resource = create_resource(context, tf_tensor);

   tensor->index = index;
   memcpy(tensor->dims, tf_tensor.dims->data, tf_tensor.dims->size * sizeof(*tensor->dims));
   tensor->scale = quant->scale->data[0];
   tensor->zero_point = quant->zero_point->data[0];

   switch (tf_tensor.type) {
   case kTfLiteUInt8:
   case kTfLiteUInt16:
   case kTfLiteUInt32:
   case kTfLiteUInt64:
      tensor->is_signed = false;
      break;
   default:
      tensor->is_signed = true;
   }
}

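/* Logs the tensors and operations of a graph about to be compiled, for
 * debugging with TEFLON_DEBUG=verbose.
 */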
static void
dump_graph(struct pipe_tensor *tensors, unsigned tensor_count, struct pipe_ml_operation *operations, unsigned operation_count)
{
   teflon_debug("\n");
   teflon_debug("teflon: compiling graph: %d tensors %d operations\n",
                tensor_count, operation_count);

   teflon_debug("%3s %-8s %3s %s %-12s\n", "idx", "scale", "zp", "has_data", "size");
   teflon_debug("=======================================\n");
   for (int i = 0; i < tensor_count; i++) {
      teflon_debug("%3d %6f %3x %-8s %dx%dx%dx%d\n",
                   tensors[i].index,
                   tensors[i].scale,
                   tensors[i].zero_point,
                   tensors[i].resource == NULL ? "no" : "yes",
                   tensors[i].dims[0], tensors[i].dims[1], tensors[i].dims[2], tensors[i].dims[3]);
   }

   teflon_debug("\n");
   teflon_debug("%3s %-6s %3s %3s %s\n", "idx", "type", "in", "out", "operation type-specific");
   teflon_debug("================================================================================================\n");
   for (int i = 0; i < operation_count; i++) {
      switch (operations[i].type) {
      case PIPE_ML_OPERATION_TYPE_ADD:
         teflon_debug("%3d %-6s %3d %3d in: %d",
                      i,
                      "ADD",
                      operations[i].input_tensor->index,
                      operations[i].output_tensor->index,
                      operations[i].add.input_tensor->index);
         break;
      case PIPE_ML_OPERATION_TYPE_CONVOLUTION:
         teflon_debug("%3d %-6s %3d %3d w: %d b: %d stride: %d pad: %s",
                      i,
                      operations[i].conv.depthwise ? "DWCONV" : "CONV",
                      operations[i].input_tensor->index,
                      operations[i].output_tensor->index,
                      operations[i].conv.weight_tensor->index,
                      operations[i].conv.bias_tensor->index,
                      operations[i].conv.stride_x,
                      operations[i].conv.padding_same ? "SAME" : "VALID");
         break;
      case PIPE_ML_OPERATION_TYPE_POOLING:
         teflon_debug("%3d %-6s %3d %3d filter: %dx%d stride: %d pad: %s",
                      i,
                      "POOL",
                      operations[i].input_tensor->index,
                      operations[i].output_tensor->index,
                      operations[i].pooling.filter_height,
                      operations[i].pooling.filter_width,
                      operations[i].pooling.stride_x,
                      operations[i].pooling.padding_same ? "SAME" : "VALID");
         break;
      }

      teflon_debug("\n");
   }
   teflon_debug("\n");
}

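/* Called by TfLite once per delegated partition: translates the replaced nodes
 * and tensors into their pipe_ml equivalents and asks the driver to compile
 * them into a subgraph.
 */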
static void *
partition_init(TfLiteContext *tf_context, const char *buffer, size_t length)
{
   const TfLiteDelegateParams *params = (const TfLiteDelegateParams *)buffer;
   struct teflon_delegate *delegate = (struct teflon_delegate *)params->delegate;
   struct pipe_context *context = delegate->context;
   struct pipe_ml_operation operations[params->nodes_to_replace->size];
   struct pipe_tensor tensors[tf_context->tensors_size];
   long start = 0, end = 0;

   memset(operations, 0, sizeof(operations));
   memset(tensors, 0, sizeof(tensors));

   if (unlikely(debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)) {
      struct timespec time;
      clock_gettime(CLOCK_MONOTONIC, &time);
      start = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;
   }

   for (int i = 0; i < tf_context->tensors_size; i++)
      fill_tensor(delegate, tf_context, &tensors[i], i);

   for (int i = 0; i < params->nodes_to_replace->size; i++) {
      const int node_index = params->nodes_to_replace->data[i];
      TfLiteNode *delegated_node = NULL;
      TfLiteRegistration *delegated_node_registration = NULL;
      tf_context->GetNodeAndRegistration(tf_context, node_index, &delegated_node,
                                         &delegated_node_registration);

      fill_operation(delegate, tf_context, delegated_node, delegated_node_registration, &operations[i], tensors);
   }

   if (debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)
      dump_graph(tensors, tf_context->tensors_size, operations, params->nodes_to_replace->size);

   struct pipe_ml_subgraph *subgraph;
   subgraph = context->ml_subgraph_create(context,
                                          operations,
                                          params->nodes_to_replace->size);

   for (int i = 0; i < tf_context->tensors_size; i++)
      pipe_resource_reference(&tensors[i].resource, NULL);

   struct teflon_subgraph *tsubgraph = calloc(1, sizeof(*tsubgraph));
   tsubgraph->base = subgraph;

   tsubgraph->input_tensors = malloc(params->input_tensors->size * sizeof(*tsubgraph->input_tensors));
   for (int i = 0; i < params->input_tensors->size; i++) {
      unsigned tensor_idx = params->input_tensors->data[i];
      TfLiteTensor *tensor = &tf_context->tensors[tensor_idx];
      if (tensor->allocation_type == kTfLiteMmapRo)
         continue;
      tsubgraph->input_tensors[tsubgraph->input_count] = tensor_idx;
      tsubgraph->input_count++;
   }

   tsubgraph->output_count = params->output_tensors->size;
   tsubgraph->output_tensors = malloc(params->output_tensors->size * sizeof(*tsubgraph->output_tensors));
   memcpy(tsubgraph->output_tensors, params->output_tensors->data,
          params->output_tensors->size * sizeof(*tsubgraph->output_tensors));

   if (unlikely(debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)) {
      struct timespec time;
      clock_gettime(CLOCK_MONOTONIC, &time);
      end = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;
      teflon_debug("teflon: compiled graph, took %ld ms\n", (end - start));
   }

   return tsubgraph;
}

static TfLiteStatus
partition_prepare(TfLiteContext *context, TfLiteNode *node)
{
   // TODO: If input size has changed, resize input, intermediate and output buffers

   return kTfLiteOk;
}

// De-allocates the per-node-and-Interpreter custom data.
static void
partition_free(TfLiteContext *tf_context, void *buffer)
{
   struct teflon_subgraph *tsubgraph = (struct teflon_subgraph *)buffer;
   struct pipe_ml_subgraph *subgraph = tsubgraph->base;
   struct pipe_context *context = subgraph->context;

   context->ml_subgraph_destroy(context, subgraph);
   free(tsubgraph->input_tensors);
   free(tsubgraph->output_tensors);
   free(tsubgraph);
}

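/* Runs a compiled subgraph: uploads the input tensor, invokes the subgraph and
 * reads the outputs back into the TfLite tensors.
 */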
static TfLiteStatus
partition_invoke(TfLiteContext *tf_context, TfLiteNode *node)
{
   struct teflon_delegate *delegate = (struct teflon_delegate *)node->delegate;
   struct teflon_subgraph *tsubgraph = (struct teflon_subgraph *)node->user_data;
   struct pipe_ml_subgraph *subgraph = tsubgraph->base;
   struct pipe_context *context = delegate->context;
   long start = 0, end = 0;

   if (unlikely(debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)) {
      struct timespec time;
      clock_gettime(CLOCK_MONOTONIC, &time);
      start = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;
   }

   struct pipe_tensor input = {0};
   /* FIXME: Support multiple inputs */
   fill_tensor(delegate, tf_context, &input, tsubgraph->input_tensors[0]);
   context->ml_subgraph_invoke(context, subgraph, &input);

   void **buffers = malloc(tsubgraph->output_count * sizeof(*buffers));
   for (unsigned i = 0; i < tsubgraph->output_count; i++)
      buffers[i] = tf_context->tensors[tsubgraph->output_tensors[i]].data.data;
   context->ml_subgraph_read_output(context, subgraph, tsubgraph->output_count, tsubgraph->output_tensors, buffers);
   free(buffers);

   pipe_resource_reference(&input.resource, NULL);

   if (unlikely(debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)) {
      struct timespec time;
      clock_gettime(CLOCK_MONOTONIC, &time);
      end = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;
      teflon_debug("teflon: invoked graph, took %ld ms\n", (end - start));
   }

   return kTfLiteOk;
}

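/* Walks the execution plan, collects the nodes this delegate can handle and
 * hands them over to TfLite to be replaced with delegate kernels.
 */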
static TfLiteStatus
PrepareDelegate(TfLiteContext *context, TfLiteDelegate *delegate)
{
   TfLiteIntArray *plan;
   TfLiteNode *node;
   TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));

   // Get a list of supported nodes.
   TfLiteIntArray *supported_nodes = malloc(plan->size * sizeof(int) + sizeof(*supported_nodes));
   supported_nodes->size = plan->size;
   unsigned node_count = 0;
   for (int i = 0; i < plan->size; i++) {
      int node_index = plan->data[i];
      bool supported = false;
      TfLiteRegistration *registration;
      TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
         context, node_index, &node, &registration));

      switch (registration->builtin_code) {
      case kTfLiteBuiltinConv2d:
      case kTfLiteBuiltinDepthwiseConv2d: {
         TfLiteTensor bias_tensor = context->tensors[node->inputs->data[2]];
         /* Skip out channel numbers that the HW doesn't support */
         if (bias_tensor.dims->data[0] > 8 && bias_tensor.dims->data[0] % 8 != 0)
            supported = false;
         else
            supported = true;
         break;
      }
      case kTfLiteBuiltinAdd:
         supported = true;
         break;
      }

      if (supported)
         supported_nodes->data[node_count++] = node_index;
   }
   supported_nodes->size = node_count;

   TfLiteRegistration registration;

   registration.init = partition_init;
   registration.free = partition_free;
   registration.prepare = partition_prepare;
   registration.invoke = partition_invoke;

   registration.profiling_string = NULL;
   registration.builtin_code = kTfLiteBuiltinDelegate;
   registration.version = 1;
   registration.registration_external = NULL;
   registration.custom_name = "Teflon Delegate";

   // Replace supported subgraphs.
   TfLiteStatus status = context->ReplaceNodeSubsetsWithDelegateKernels(
      context,
      registration,
      supported_nodes,
      delegate);

   free(supported_nodes);

   return status;
}

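/* Nothing to copy: outputs are read back into the TfLite tensors in
 * partition_invoke.
 */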
static TfLiteStatus
CopyFromBufferHandle(TfLiteContext *context,
                     TfLiteDelegate *delegate,
                     TfLiteBufferHandle buffer_handle,
                     TfLiteTensor *tensor)
{
   return kTfLiteOk;
}

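/* No buffer handles are handed out, so there is nothing to free. */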
static void
FreeBufferHandle(TfLiteContext *context,
                 TfLiteDelegate *delegate,
                 TfLiteBufferHandle *handle)
{
}

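/* Entry points for TfLite's external delegate mechanism.  A rough usage sketch
 * with the TfLite C API (assuming the plugin was built as libteflon.so and
 * loaded by the application):
 *
 *    TfLiteDelegate *delegate =
 *       tflite_plugin_create_delegate(NULL, NULL, 0, NULL);
 *    TfLiteInterpreterOptions *options = TfLiteInterpreterOptionsCreate();
 *    TfLiteInterpreterOptionsAddDelegate(options, delegate);
 *    ...
 *    tflite_plugin_destroy_delegate(delegate);
 */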
TfLiteDelegate *tflite_plugin_create_delegate(char **options_keys,
                                              char **options_values,
                                              size_t num_options,
                                              void (*report_error)(const char *));

void tflite_plugin_destroy_delegate(TfLiteDelegate *delegate);

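/* Probes the available Gallium devices, keeps the first one whose driver
 * supports ML subgraphs (currently etnaviv) and creates a compute-only context
 * on it.
 */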
__attribute__((visibility("default")))
TfLiteDelegate *tflite_plugin_create_delegate(char **options_keys,
                                              char **options_values,
                                              size_t num_options,
                                              void (*report_error)(const char *))
{
   struct teflon_delegate *delegate = (struct teflon_delegate *)calloc(1, sizeof(*delegate));
   struct pipe_screen *screen;
   struct pipe_loader_device **devs;

   delegate->base.flags = kTfLiteDelegateFlagsAllowDynamicTensors | kTfLiteDelegateFlagsRequirePropagatedShapes;
   delegate->base.Prepare = &PrepareDelegate;
   delegate->base.CopyFromBufferHandle = &CopyFromBufferHandle;
   delegate->base.FreeBufferHandle = &FreeBufferHandle;

   int n = pipe_loader_probe(NULL, 0, false);
   devs = (struct pipe_loader_device **)malloc(sizeof(*devs) * n);
   pipe_loader_probe(devs, n, false);

   for (int i = 0; i < n; i++) {
      if (strstr("etnaviv", devs[i]->driver_name))
         delegate->dev = devs[i];
      else
         pipe_loader_release(&devs[i], 1);
   }
   free(devs);

   if (delegate->dev == NULL) {
      fprintf(stderr, "Couldn't open kernel device\n");
      return NULL;
   }

   teflon_debug("Teflon delegate: loaded %s driver\n", delegate->dev->driver_name);

   screen = pipe_loader_create_screen(delegate->dev);
   delegate->context = screen->context_create(screen, NULL, PIPE_CONTEXT_COMPUTE_ONLY);

   return &delegate->base;
}

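/* Tears down the context, screen and device created in
 * tflite_plugin_create_delegate.
 */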
__attribute__((visibility("default")))
void tflite_plugin_destroy_delegate(TfLiteDelegate *tflite_delegate)
{
   struct teflon_delegate *delegate = (struct teflon_delegate *)tflite_delegate;
   struct pipe_screen *screen;

   if (tflite_delegate == NULL) {
      fprintf(stderr, "tflite_plugin_destroy_delegate: NULL delegate!\n");
      return;
   }

   screen = delegate->context->screen;
   delegate->context->destroy(delegate->context);
   screen->destroy(screen);
   pipe_loader_release(&delegate->dev, 1);
   free(delegate);
}