// Copyright 2020 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// WARNING: Until b/191428000 is fixed you need to manually generate and update
// the generated flatbuffer code when modifying this file. See BUILD for more
// information.

// This schema defines how to configure TFLite for delegation. These
// definitions can be used in multiple ways: as output of a compatibility list,
// in benchmarking tools and to decouple delegate instantiation from code.
//
// The schema is work-in-progress, covering the most broadly used delegates and
// options.

syntax = "proto2";

package tflite.proto;

// ExecutionPreference is used to match accelerators against the preferences of
// the current application or use case. Some of the values here can appear both
// in the compatibility list and as input, some only as input.
//
// These are separate from NNAPIExecutionPreference - the compatibility list
// design doesn't assume a one-to-one mapping between which use cases
// compatibility list entries have been developed for and what settings are
// used for NNAPI.
enum ExecutionPreference {
  // Match any selected preference. Allowlist (semantically - value is same as
  // on input).
  ANY = 0;
  // Match low latency preference. Both compatibility list and input.
  LOW_LATENCY = 1;
  // Match low power preference. Both compatibility list and input.
  LOW_POWER = 2;
  // Never accelerate. Can be used for input to compatibility list or for
  // standalone Acceleration configuration.
  FORCE_CPU = 3;
}

// TFLite accelerator to use.
enum Delegate {
  NONE = 0;

  NNAPI = 1;
  GPU = 2;
  HEXAGON = 3;
  XNNPACK = 4;
  // The EdgeTpu in Pixel devices.
  EDGETPU = 5;
  // The Coral EdgeTpu Dev Board / USB accelerator.
  EDGETPU_CORAL = 6;
}

enum NNAPIExecutionPreference {
  // Undefined.
  UNDEFINED = 0;
  // Prefer executing in a way that minimizes battery drain.
  NNAPI_LOW_POWER = 1;
  // Prefer returning a single answer as fast as possible, even if this causes
  // more power consumption.
  NNAPI_FAST_SINGLE_ANSWER = 2;
  // Prefer maximizing the throughput of successive frames, for example when
  // processing successive frames coming from the camera.
  NNAPI_SUSTAINED_SPEED = 3;
}

enum NNAPIExecutionPriority {
  NNAPI_PRIORITY_UNDEFINED = 0;
  NNAPI_PRIORITY_LOW = 1;
  NNAPI_PRIORITY_MEDIUM = 2;
  NNAPI_PRIORITY_HIGH = 3;
}

// One possible acceleration configuration.
message ComputeSettings {
  // Which preference to use this accelerator for.
  optional ExecutionPreference preference = 1;
  // How to configure TFLite.
  optional TFLiteSettings tflite_settings = 2;
  // Identifiers to use for instrumentation and telemetry.
  optional string model_namespace_for_statistics = 3;
  optional string model_identifier_for_statistics = 4;

  // 'Maybe' acceleration: use mini-benchmark to select settings.
  optional MinibenchmarkSettings settings_to_test_locally = 5;
}
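
// Illustrative example (a sketch, not part of the schema): a ComputeSettings
// message requesting GPU acceleration for a low-latency use case could be
// written in textproto as follows. The identifier strings are hypothetical.
//
//   preference: LOW_LATENCY
//   tflite_settings { delegate: GPU }
//   model_namespace_for_statistics: "my_app"       # hypothetical
//   model_identifier_for_statistics: "my_model_v1" # hypothetical
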
// NNAPI delegate settings.
message NNAPISettings {
  // Which instance (NNAPI accelerator) to use. One driver may provide several
  // accelerators (though a driver may also hide several back-ends behind one
  // name, at the choice of the driver vendor).
  // Note that driver introspection is only available in Android Q and later.
  optional string accelerator_name = 1;

  // NNAPI model compilation caching settings to be passed to
  // tflite::StatefulNnApiDelegate
  optional string cache_directory = 2;
  optional string model_token = 3;

  // NNAPI execution preference to pass. See
  // https://developer.android.com/ndk/reference/group/neural-networks.html
  optional NNAPIExecutionPreference execution_preference = 4;

  // Number of instances to cache for the same model (for input size
  // changes). This is mandatory for getting reasonable performance in that
  // case.
  optional int32 no_of_nnapi_instances_to_cache = 5;

  // Deprecated; use the fallback_settings in TFLiteSettings.
  //
  // Whether to automatically fall back to TFLite CPU path.
  optional FallbackSettings fallback_settings = 6 [deprecated = true];

  // Whether to allow use of NNAPI CPU (nnapi-reference accelerator) on Android
  // 10+ when an accelerator name is not specified. The NNAPI CPU typically
  // performs less well than the TfLite built-in kernels, but allowing it lets
  // a model be partially accelerated, which may be a win.
  optional bool allow_nnapi_cpu_on_android_10_plus = 7;

  optional NNAPIExecutionPriority execution_priority = 8;

  // Whether to allow dynamic dimension sizes without re-compilation.
  // A tensor with dynamic dimensions must have a valid dims_signature
  // defined.
  // Only supported in NNAPI 1.1 and newer versions.
  // WARNING: Setting this flag to true may result in the model being rejected
  // by the accelerator. This should only be enabled if the target device
  // supports dynamic dimensions of the model.
  // By default this is set to false.
  optional bool allow_dynamic_dimensions = 9;

  // Whether to allow the NNAPI accelerator to optionally use lower-precision
  // float16 (16-bit floating point) arithmetic when doing calculations on
  // float32 (32-bit floating point).
  optional bool allow_fp16_precision_for_fp32 = 10;

  // Whether to use NNAPI Burst mode.
  // Burst mode allows accelerators to efficiently manage resources, which
  // significantly reduces overhead, especially if the same delegate instance
  // is used for multiple inferences.
  optional bool use_burst_computation = 11;
}
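
// Illustrative example (a sketch): an NNAPISettings message targeting a
// specific accelerator with compilation caching enabled could look like the
// textproto below. The accelerator name and cache path are hypothetical.
//
//   accelerator_name: "example-dsp"                  # hypothetical driver
//   cache_directory: "/data/data/my.app/code_cache"  # hypothetical path
//   model_token: "my_model_v1"
//   execution_preference: NNAPI_SUSTAINED_SPEED
//   use_burst_computation: true
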
// Which GPU backend to select. Default behaviour on Android is to try OpenCL
// and if it's not available fall back to OpenGL.
enum GPUBackend {
  UNSET = 0;
  OPENCL = 1;
  OPENGL = 2;
  // Not yet supported.
  // VULKAN = 3;
  // METAL = 4;
}

// GPU inference priorities define relative priorities given by the GPU
// delegate to different client needs.
// Corresponds to TfLiteGpuInferencePriority.
enum GPUInferencePriority {
  GPU_PRIORITY_AUTO = 0;
  GPU_PRIORITY_MAX_PRECISION = 1;
  GPU_PRIORITY_MIN_LATENCY = 2;
  GPU_PRIORITY_MIN_MEMORY_USAGE = 3;
}

// GPU inference preference for initialization time vs. inference time.
// Corresponds to TfLiteGpuInferenceUsage.
enum GPUInferenceUsage {
  // Delegate will be used only once, therefore, bootstrap/init time should
  // be taken into account.
  GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER = 0;

  // Prefer maximizing the throughput. Same delegate will be used repeatedly on
  // multiple inputs.
  GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED = 1;
}

// GPU Delegate settings.
//
// See
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/gpu/delegate.h
message GPUSettings {
  // Ignored if inference_priority1/2/3 are set.
  optional bool is_precision_loss_allowed = 1;
  optional bool enable_quantized_inference = 2 [default = true];
  optional GPUBackend force_backend = 3;

  // Ordered priorities provide better control over desired semantics,
  // where priority(n) is more important than priority(n+1). Therefore,
  // each time the inference engine needs to make a decision, it uses the
  // ordered priorities to do so.
  //
  // Default values correspond to GPU_PRIORITY_AUTO.
  // AUTO priority can only be used when higher priorities are fully specified.
  // For example:
  //   VALID:   priority1 = MIN_LATENCY, priority2 = AUTO, priority3 = AUTO
  //   VALID:   priority1 = MIN_LATENCY, priority2 = MAX_PRECISION,
  //            priority3 = AUTO
  //   INVALID: priority1 = AUTO, priority2 = MIN_LATENCY, priority3 = AUTO
  //   INVALID: priority1 = MIN_LATENCY, priority2 = AUTO,
  //            priority3 = MAX_PRECISION
  // Invalid priorities will result in an error.
  //
  // For more information, see TfLiteGpuDelegateOptionsV2.
  optional GPUInferencePriority inference_priority1 = 4
      [default = GPU_PRIORITY_AUTO];
  optional GPUInferencePriority inference_priority2 = 5
      [default = GPU_PRIORITY_AUTO];
  optional GPUInferencePriority inference_priority3 = 6
      [default = GPU_PRIORITY_AUTO];

  // Whether to optimize for compilation+execution time or execution time only.
  optional GPUInferenceUsage inference_preference = 7;

  // Model serialization. Setting both of these fields will also set the
  // TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_SERIALIZATION flag on the delegate.
  //
  // GPU model serialization directory passed in TfLiteGpuDelegateOptionsV2.
  // This should be set to the application's code cache directory so that it
  // cannot be accessed by other apps and is correctly deleted on app updates.
  optional string cache_directory = 8;
  // Normally, the model name with a version number should be provided here,
  // since each model needs a unique ID to avoid cache collisions.
  optional string model_token = 9;
}
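
// Illustrative example (a sketch): a GPUSettings message that prioritizes
// latency, optimizes for repeated use, and enables model serialization could
// look like the textproto below. The directory and token are hypothetical;
// note that setting both cache_directory and model_token is what turns
// serialization on.
//
//   inference_priority1: GPU_PRIORITY_MIN_LATENCY
//   inference_preference: GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED
//   cache_directory: "/data/data/my.app/code_cache"  # hypothetical path
//   model_token: "my_model_v1"                       # hypothetical token
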
// Hexagon Delegate settings.
//
// See
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/hexagon/hexagon_delegate.h
message HexagonSettings {
  optional int32 debug_level = 1;
  optional int32 powersave_level = 2;
  optional bool print_graph_profile = 3;
  optional bool print_graph_debug = 4;
}

// XNNPack Delegate settings.
//
// See
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h
message XNNPackSettings {
  optional int32 num_threads = 1;
}

// EdgeTPU device spec.
//
message EdgeTpuDeviceSpec {
  // EdgeTPU platform types.
  enum PlatformType {
    MMIO = 0;
    REFERENCE = 1;
    SIMULATOR = 2;
    REMOTE_SIMULATOR = 3;
  }

  // Execution platform for the EdgeTPU device.
  optional PlatformType platform_type = 1;

  // Number of chips to use for the EdgeTPU device.
  optional int32 num_chips = 2;

  // Paths to the EdgeTPU devices.
  repeated string device_paths = 3;

  // Chip family used by the EdgeTpu device.
  optional int32 chip_family = 4;
}

// Generic definitions of EdgeTPU power states.
enum EdgeTpuPowerState {
  // Undefined power state.
  UNDEFINED_POWERSTATE = 0;

  // TPU core is off but control cluster is on.
  TPU_CORE_OFF = 1;

  // A non-active low-power state that has much smaller transition time to
  // active compared to off.
  READY = 2;

  // Minimum power active state.
  ACTIVE_MIN_POWER = 3;

  // Very low performance, very low power.
  ACTIVE_VERY_LOW_POWER = 4;

  // Low performance, low power.
  ACTIVE_LOW_POWER = 5;

  // The normal performance and power. This setting usually provides the
  // optimal perf/power trade-off for the average use case.
  ACTIVE = 6;

  // Maximum performance level. Potentially higher power and thermal. This
  // setting may not be allowed in production depending on the system.
  OVER_DRIVE = 7;
}

message EdgeTpuInactivePowerConfig {
  // Inactive power states between inferences.
  optional EdgeTpuPowerState inactive_power_state = 1;

  // Inactive timeout in microseconds between inferences.
  optional int64 inactive_timeout_us = 2;
}

// EdgeTPU Delegate settings.
//
message EdgeTpuSettings {
  // Float truncation types for EdgeTPU.
  enum FloatTruncationType {
    UNSPECIFIED = 0;
    NO_TRUNCATION = 1;
    BFLOAT16 = 2;
    HALF = 3;
  }

  // Target inference power state for running the model.
  optional EdgeTpuPowerState inference_power_state = 1;

  // Inactive power states between inferences.
  repeated EdgeTpuInactivePowerConfig inactive_power_configs = 2;

  // Priority for the inference request.
  optional int32 inference_priority = 3 [default = -1];

  // Device spec for creating the EdgeTpu device.
  optional EdgeTpuDeviceSpec edgetpu_device_spec = 4;

  // A unique identifier of the input TfLite model.
  optional string model_token = 5;

  // Float truncation type for EdgeTPU.
  optional FloatTruncationType float_truncation_type = 6;
}
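
// Illustrative example (a sketch): an EdgeTpuSettings message that runs
// inference in the normal power state and drops to READY between inferences
// could look like the textproto below. The device path and timeout are
// hypothetical.
//
//   inference_power_state: ACTIVE
//   inactive_power_configs {
//     inactive_power_state: READY
//     inactive_timeout_us: 1000000  # 1 second, hypothetical
//   }
//   edgetpu_device_spec {
//     platform_type: MMIO
//     num_chips: 1
//     device_paths: "/dev/example_tpu"  # hypothetical path
//   }
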
// Coral Dev Board / USB accelerator delegate settings.
//
// See
// https://github.com/google-coral/edgetpu/blob/master/libedgetpu/edgetpu_c.h
message CoralSettings {
  enum Performance {
    UNDEFINED = 0;
    MAXIMUM = 1;
    HIGH = 2;
    MEDIUM = 3;
    LOW = 4;
  }

  // The Edge Tpu device to be used. See
  // https://github.com/google-coral/libcoral/blob/982426546dfa10128376d0c24fd8a8b161daac97/coral/tflite_utils.h#L131-L137
  optional string device = 1;
  // The desired performance level. This setting adjusts the internal clock
  // rate to achieve different performance / power balance. Higher performance
  // values improve speed, but increase power usage.
  optional Performance performance = 2 [default = MAXIMUM];
  // If true, always perform device firmware update (DFU) after reset. DFU is
  // usually only necessary after power cycle.
  optional bool usb_always_dfu = 3;
  // The maximum bulk-in queue length. A larger queue length may improve USB
  // performance in the direction from device to host. When not specified (or
  // zero), `usb_max_bulk_in_queue_length` will default to 32 according to the
  // current EdgeTpu Coral implementation.
  optional int32 usb_max_bulk_in_queue_length = 4;
}

message CPUSettings {
  // Set to -1 to let the interpreter choose. Otherwise, must be > 0.
  optional int32 num_threads = 1 [default = -1];
}

// How to configure TFLite.
message TFLiteSettings {
  // Which delegate to use.
  optional Delegate delegate = 1;

  // How to configure the chosen delegate.
  // (In principle we would like to use 'oneof', but flatc turns that into a
  // nested anonymous table rather than a union. See
  // https://github.com/google/flatbuffers/issues/4628).
  optional NNAPISettings nnapi_settings = 2;
  optional GPUSettings gpu_settings = 3;
  optional HexagonSettings hexagon_settings = 4;
  optional XNNPackSettings xnnpack_settings = 5;

  // How to configure CPU execution.
  optional CPUSettings cpu_settings = 6;

  // Shared delegation settings.
  optional int32 max_delegated_partitions = 7;

  // For configuring the EdgeTpuDelegate.
  optional EdgeTpuSettings edgetpu_settings = 8;

  // For configuring the Coral EdgeTpu Delegate.
  optional CoralSettings coral_settings = 10;

  // Whether to automatically fall back to the TFLite CPU path.
  optional FallbackSettings fallback_settings = 9;
}

// Whether to automatically fall back to the TFLite CPU path on delegation
// errors.
//
// Typically fallback is enabled in production use but disabled in tests and
// benchmarks to ensure they test the intended path.
message FallbackSettings {
  // Whether to allow automatically falling back to the TfLite CPU path on
  // compilation failure. Default is not allowing automatic fallback.
  //
  // This is useful in naive production use cases where the caller would prefer
  // for the model to run even if it's not accelerated. More advanced users
  // will implement fallback themselves; e.g., by using a different model on
  // CPU.
  //
  // Note that compilation errors may occur either at initial
  // ModifyGraphWithDelegate() time, or when calling AllocateTensors() after
  // resizing.
  optional bool allow_automatic_fallback_on_compilation_error = 7;
  // Whether to allow automatically falling back to the TfLite CPU path on
  // execution error. Default is not allowing automatic fallback.
  //
  // Experimental, use with care (only when you have complete control over the
  // client code).
  //
  // The caveat above for compilation error holds. Additionally, execution-time
  // errors are harder to handle automatically as they require invalidating the
  // TfLite interpreter which most client code has not been designed to deal
  // with.
  optional bool allow_automatic_fallback_on_execution_error = 8;
}
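
// Illustrative example (a sketch): a TFLiteSettings message that selects the
// GPU delegate and allows automatic CPU fallback on compilation failure could
// look like the textproto below.
//
//   delegate: GPU
//   gpu_settings { inference_priority1: GPU_PRIORITY_MIN_LATENCY }
//   fallback_settings {
//     allow_automatic_fallback_on_compilation_error: true
//   }
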
// On-device mini-benchmark result storage. The following definitions are used
// to keep an append-only log of benchmark results on-device. (Hence there is a
// single top-level event that is used for all data).
//
// These definitions don't need a proto-to-flatbuffer conversion, since they
// are not used for specifying configuration in the Tasks library.

// Which stage of benchmarking the event is for.
// There might be multiple events with the same type, if a benchmark is run
// multiple times.
enum BenchmarkEventType {
  UNDEFINED_BENCHMARK_EVENT_TYPE = 0;
  // Benchmark start. A start without an end can be interpreted as a test that
  // has crashed or hung.
  START = 1;
  // Benchmarking completion. A model was successfully loaded, acceleration
  // configured and inference run without errors. There may still be an issue
  // with correctness of results, or with performance.
  END = 2;
  // Benchmark was not completed due to an error. The error may be a handled
  // error (e.g., failure in a delegate), or a crash.
  ERROR = 3;
  // Benchmark data has been sent for logging.
  LOGGED = 4;
  // Benchmark encountered an error but was able to continue. The error is not
  // related to the model execution but to the mini-benchmark logic. An example
  // of such an error is a failure when trying to set the CPU affinity of the
  // benchmark runner process.
  RECOVERED_ERROR = 5;
}

// A correctness metric from a benchmark, for example KL-divergence between
// known-good CPU output and on-device output. These are primarily used for
// telemetry and monitored server-side.
message BenchmarkMetric {
  optional string name = 1;
  repeated float values = 2 [packed = true];
}

// Outcome of a successfully completed benchmark run. This information is
// intended both to be used on-device to select the best compute configuration
// and to be sent to the server for monitoring.
//
// Used with event type END.
message BenchmarkResult {
  // Time to load model and apply acceleration. Initialization may get run
  // multiple times to get information on variance.
  repeated int64 initialization_time_us = 1 [packed = true];
  // Time to run inference (call Invoke()). Inference may get run multiple
  // times to get information on variance.
  repeated int64 inference_time_us = 2 [packed = true];
  // Maximum memory used. Measures size of application heap (does not
  // necessarily take into account driver-side allocation).
  optional int32 max_memory_kb = 3;
  // Whether the inference produced correct results (validation graph output
  // 'ok' for all test inputs). Used on-device to disallow configurations that
  // produce incorrect results (e.g., due to OpenCL driver bugs).
  optional bool ok = 4;
  // Metrics that were used to determine the 'ok' status.
  repeated BenchmarkMetric metrics = 5;
}

// A handled error.
message ErrorCode {
  // Which delegate the error comes from (or NONE, if it comes from the tflite
  // framework).
  optional Delegate source = 1;
  // What the tflite level error is.
  optional int32 tflite_error = 2;
  // What the underlying error is (e.g., NNAPI or OpenGL error).
  optional int64 underlying_api_error = 3;
}

// When during benchmark execution an error occurred.
enum BenchmarkStage {
  UNKNOWN = 0;
  // During model loading or delegation.
  INITIALIZATION = 1;
  // During inference.
  INFERENCE = 2;
}

// An error that occurred during benchmarking.
//
// Used with event type ERROR.
message BenchmarkError {
  // How far benchmarking got.
  optional BenchmarkStage stage = 1;
  // Process exit code.
  optional int32 exit_code = 2;
  // Signal the process received.
  optional int32 signal = 3;
  // Handled tflite error.
  repeated ErrorCode error_code = 4;
  // Mini-benchmark error code.
  optional int32 mini_benchmark_error_code = 5;
}
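
// Illustrative example (a sketch with made-up numbers): a BenchmarkResult for
// a run that passed validation could look like the textproto below. The
// metric name is hypothetical.
//
//   initialization_time_us: 523000
//   inference_time_us: 14100
//   inference_time_us: 13900
//   max_memory_kb: 51200
//   ok: true
//   metrics { name: "mse" values: 0.0001 }  # hypothetical metric
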
// Top-level benchmarking event stored on-device. All events for a model are
// parsed to detect the status.
message BenchmarkEvent {
  // Which settings were used for benchmarking.
  optional TFLiteSettings tflite_settings = 1;
  // Type of the event.
  optional BenchmarkEventType event_type = 2;
  // Result of benchmark, used when type is END.
  optional BenchmarkResult result = 3;
  // Error during benchmark, used when type is ERROR.
  optional BenchmarkError error = 4;
  // Start timestamps. These are used for
  // 1. Checking whether a test was started but not completed within a given
  //    deadline.
  // 2. Optionally, telemetry timestamps.
  optional int64 boottime_us = 5;
  optional int64 wallclock_us = 6;
}

// Represents the decision on the best acceleration from the mini-benchmark.
message BestAccelerationDecision {
  // Number of events used to take the decision.
  // Using just the size instead of the full list of events to save space.
  optional int32 number_of_source_events = 1;

  // Event with the minimum latency among the source events.
  optional BenchmarkEvent min_latency_event = 2;

  // Min latency as read from min_latency_event.
  optional int64 min_inference_time_us = 3;
}

// Represents a failure during the initialization of the mini-benchmark.
message BenchmarkInitializationFailure {
  // Status code returned by the mini-benchmark initialization function.
  optional int32 initialization_status = 1;
}

// Events generated by the mini-benchmark before and after triggering the
// different configuration-specific benchmarks.
message MiniBenchmarkEvent {
  // Not using oneof because of the way the C++ code is generated.
  // See the comment above on TFLiteSettings for details.

  // If set to true, this event is used to mark all previous events in the
  // mini-benchmark internal storage as read; otherwise, one of the other
  // fields in this message will have a value.
  optional bool is_log_flushing_event = 1;
  // Event generated when a best acceleration decision is taken.
  optional BestAccelerationDecision best_acceleration_decision = 2;
  // Reports a failure during mini-benchmark initialization.
  optional BenchmarkInitializationFailure initialization_failure = 3;
  // Event generated while benchmarking the different settings to test locally.
  optional BenchmarkEvent benchmark_event = 4;
}

// How to access the model for the mini-benchmark.
// Since the mini-benchmark runs in a separate process, it cannot access an
// in-memory model. It can read the model either from a file or from a file
// descriptor. The file descriptor typically comes from the Android asset
// manager.
//
// Users should set either filename, or all of fd, offset and length.
message ModelFile {
  // Filename for reading the model from.
  optional string filename = 1;
  // File descriptor to read the model from.
  optional int64 fd = 2;
  // Offset of the model in the file descriptor.
  optional int64 offset = 3;
  // Length of the model in the file descriptor.
  optional int64 length = 4;
}
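
// Illustrative example (a sketch; all values are hypothetical): reading the
// model from a file could be configured as
//
//   filename: "/data/local/tmp/model.tflite"
//
// while reading from an asset-manager file descriptor would instead set all
// three of
//
//   fd: 42
//   offset: 4096
//   length: 102400
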
// Where to store mini-benchmark state.
message BenchmarkStoragePaths {
  // Base path to the files used to store benchmark results in. Two files
  // will be generated: one with the given path and an extra file to store
  // events related to best acceleration results at path storage_file_path +
  // ".extra.fb". Must be specific to the model.
  // Note: on Android, this should be in the code cache directory.
  optional string storage_file_path = 1;

  // Path to a directory for intermediate files (lock files, extracted
  // binaries).
  // Note: on Android, this typically is the data cache directory (i.e. the one
  // returned by `getCacheDir()`).
  optional string data_directory_path = 2;
}

// How to run a mini-benchmark.
message MinibenchmarkSettings {
  // Which settings to test. This would typically be filled in from an
  // allowlist.
  repeated TFLiteSettings settings_to_test = 1;
  // How to access the model. This would typically be set dynamically, as it
  // depends on the application folder and/or runtime state.
  optional ModelFile model_file = 2;
  // Where to store state. This would typically be set dynamically, as it
  // depends on the application folder.
  optional BenchmarkStoragePaths storage_paths = 3;
}
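
// Illustrative example (a sketch; paths are hypothetical): a
// MinibenchmarkSettings message that locally compares GPU acceleration
// against the plain CPU path could look like
//
//   settings_to_test { delegate: GPU }
//   settings_to_test { delegate: NONE }
//   model_file { filename: "/data/local/tmp/model.tflite" }
//   storage_paths {
//     storage_file_path: "/data/data/my.app/code_cache/model.mb.fb"
//     data_directory_path: "/data/data/my.app/cache"
//   }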