/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl_delegate.h"

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <string>
#include <unordered_set>
#include <vector>

#include <EGL/egl.h>
#include <GLES3/gl31.h>
#include "absl/types/span.h"
#include "tensorflow/lite/builtin_ops.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/delegates/gpu/common/convert.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_builder.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/general_transformations.h"
#include "tensorflow/lite/delegates/gpu/gl/api.h"
#include "tensorflow/lite/delegates/gpu/gl/command_queue.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler.h"
#include "tensorflow/lite/delegates/gpu/gl/converters/bhwc_to_phwc4.h"
#include "tensorflow/lite/delegates/gpu/gl/converters/phwc4_to_bhwc.h"
#include "tensorflow/lite/delegates/gpu/gl/egl_environment.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_call.h"
#include "tensorflow/lite/delegates/gpu/gl/kernels/registry.h"
#include "tensorflow/lite/delegates/gpu/gl/request_gpu_info.h"
#include "tensorflow/lite/delegates/gpu/gl/workgroups/best_effort_calculator.h"
#include "tensorflow/lite/minimal_logging.h"

#ifndef TFLITE_GPU_BINARY_RELEASE
#include "flatbuffers/flatbuffers.h"  // TF:flatbuffers
#include "tensorflow/lite/delegates/gpu/gl/metadata_generated.h"
#include "tensorflow/lite/schema/schema_generated.h"
#endif  // TFLITE_GPU_BINARY_RELEASE

namespace tflite {
namespace gpu {
namespace gl {
namespace {

// Forward declarations.
TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate);
TfLiteStatus DelegateCopyFromBufferHandle(
    TfLiteContext* context, TfLiteDelegate* delegate,
    TfLiteBufferHandle buffer_handle,  // ValueId
    TfLiteTensor* tensor);
TfLiteStatus DelegateCopyToBufferHandle(
    TfLiteContext* context, TfLiteDelegate* delegate,
    TfLiteBufferHandle buffer_handle,  // ValueId
    TfLiteTensor* tensor);

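// Returns true when a tensor of the given BHWC shape has the same memory
// layout as its PHWC4 representation (channels grouped into slices of four),
// so the BHWC<->PHWC4 conversion for externally bound buffers can be skipped:
// either the tensor has exactly 4 channels, or it is 1x1 spatially with a
// channel count divisible by 4.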
inline bool IsPHWC4(const BHWC& shape) {
  return shape.c == 4 || (shape.h == 1 && shape.w == 1 && shape.c % 4 == 0);
}

class Delegate {
  struct ValueRef {
    BHWC shape;
    int tensor_index;
  };

 public:
  explicit Delegate(const TfLiteGpuDelegateOptions* options) {
    if (options) {
      options_ = *options;
    } else {
      // Default options.
      options_.metadata = nullptr;
      options_.compile_options.precision_loss_allowed = 0;
      options_.compile_options.preferred_gl_object_type =
          TFLITE_GL_OBJECT_TYPE_FASTEST;
      options_.compile_options.dynamic_batch_enabled = 0;
    }
  }

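  // Reads the PHWC4 GPU buffer registered for `handle` and writes its
  // contents into the CPU tensor, converting the layout back to BHWC.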
  Status CopyFromBufferHandle(TfLiteBufferHandle handle, TfLiteTensor* tensor) {
    ValueRef ref;
    RETURN_IF_ERROR(FindObject(handle, &ref));
    auto buffer = phwc4_objects_.FindBuffer(handle);
    return buffer->MappedRead<float>([&](absl::Span<const float> data) {
      tensor->data_is_stale = false;
      return ConvertFromPHWC4(
          data, ref.shape,
          absl::MakeSpan(tensor->data.f, tensor->bytes / sizeof(float)));
    });
  }

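  // Converts the CPU tensor's BHWC data to PHWC4 and writes it into the GPU
  // buffer registered for `handle`.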
  Status CopyToBufferHandle(TfLiteBufferHandle handle,
                            TfLiteTensor* tensor) const {
    ValueRef ref;
    RETURN_IF_ERROR(FindObject(handle, &ref));
    auto buffer = phwc4_objects_.FindBuffer(handle);
    return buffer->MappedWrite<float>([&](absl::Span<float> data) {
      return ConvertToPHWC4(
          absl::MakeConstSpan(tensor->data.f, tensor->bytes / sizeof(float)),
          ref.shape, data);
    });
  }

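  // Registers a caller-owned SSBO as the external BHWC object for the given
  // TFLite tensor index; the delegate does not take ownership of the buffer.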
  Status BindBufferToTensor(GLuint ssbo, int tensor_index) {
    int64_t bytes_size;
    RETURN_IF_ERROR(GetSSBOSize(ssbo, &bytes_size));
    return bhwc_objects_.RegisterBuffer(
        tensor_index, GlBuffer(GL_SHADER_STORAGE_BUFFER, ssbo, bytes_size,
                               /* offset = */ 0,
                               /* has_ownership = */ false));
  }

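  // Builds the GPU program for the delegated subgraph: converts the TFLite
  // execution plan into a GraphFloat32, allocates PHWC4 buffers for the
  // subgraph inputs and outputs, compiles the shaders, and creates the
  // inference context.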
  Status Prepare(TfLiteContext* context,
                 const TfLiteDelegateParams* delegate_params) {
    // Extract the TFLite delegate execution plan from the context and convert
    // it into a GraphFloat32.
    GraphFloat32 graph;
    RETURN_IF_ERROR(BuildModel(context, delegate_params, &graph));

    // Apply general transformations on the graph.
    NullTransformationReporter reporter;
    ModelTransformer transformer(&graph, &reporter);
    if (!ApplyGeneralTransformations(&transformer)) {
      return InternalError("Graph general transformations failed");
    }

    if (!env_) RETURN_IF_ERROR(EglEnvironment::NewEglEnvironment(&env_));

    // TODO(impjdi): Remove code duplication.
    auto values = graph.values();
    auto find_value = [&](int tensor_index) -> Value<TensorRef<BHWC>>* {
      for (auto value : values) {
        if (value->tensor.ref == tensor_index) return value;
      }
      return nullptr;
    };
    tensors_.reserve(values.back()->id + 1);
    for (auto value : values) {
      if (tensors_.size() <= value->id) {
        tensors_.resize(value->id + 1);
      }
      tensors_[value->id] = {value->tensor.shape, 0};
    }

    std::unordered_set<int> tflite_graph_io;

    // Prepare graph inputs.
    //
    // Note that graph.inputs() cannot be used directly, as the notion of
    // graph input has a different meaning in public API and GPU-internal API.
    {
      inputs_.clear();
      inputs_.reserve(delegate_params->input_tensors->size);
      for (int i = 0; i < delegate_params->input_tensors->size; ++i) {
        const int tensor_index = delegate_params->input_tensors->data[i];
        auto* tensor = context->tensors + tensor_index;
        if (tensor->allocation_type == TfLiteAllocationType::kTfLiteMmapRo) {
          continue;
        }
        tflite_graph_io.insert(tensor_index);
        const auto* input = find_value(tensor_index);
        if (!input || tensor->type != TfLiteType::kTfLiteFloat32) {
          return NotFoundError("Input tensor is not found in the graph.");
        }

        inputs_.push_back(input->id);
        tensor->buffer_handle = input->id;
        tensor->delegate = &delegate_;
        tensors_[input->id].tensor_index = tensor_index;

        // Create phwc4 input buffer.
        // Check whether the externally provided object is already in PHWC4.
        // If so, the conversion step can be skipped. The same buffer is kept
        // in bhwc_objects_ to indicate that an externally provided buffer
        // exists.
        auto external_buffer = bhwc_objects_.FindBuffer(tensor_index);
        GlBuffer buffer;
        if (IsPHWC4(input->tensor.shape) && external_buffer) {
          buffer = external_buffer->MakeRef();
        } else {
          RETURN_IF_ERROR(CreateReadWriteShaderStorageBuffer<float>(
              GetElementsSizeForPHWC4(input->tensor.shape), &buffer));
        }
        RETURN_IF_ERROR(
            phwc4_objects_.RegisterBuffer(input->id, std::move(buffer)));
      }
    }

    // Prepare graph outputs.
    //
    // Note that graph.outputs() cannot be used directly, as the notion of
    // graph output has a different meaning in public API and GPU-internal API.
    {
      outputs_.clear();
      outputs_.reserve(delegate_params->output_tensors->size);
      for (int i = 0; i < delegate_params->output_tensors->size; ++i) {
        const int tensor_index = delegate_params->output_tensors->data[i];
        auto* tensor = context->tensors + tensor_index;
        tflite_graph_io.insert(tensor_index);
        const auto* output = find_value(tensor_index);
        if (!output || tensor->type != TfLiteType::kTfLiteFloat32) {
          return NotFoundError("Output tensor is not found in the graph.");
        }

        outputs_.push_back(output->id);
        tensor->buffer_handle = output->id;
        tensor->delegate = &delegate_;
        tensors_[output->id].tensor_index = tensor_index;

        // Create phwc4 output buffer.
        // Check whether the externally provided object is already in PHWC4.
        // If so, the conversion step can be skipped.
        auto external_buffer = bhwc_objects_.FindBuffer(tensor_index);
        GlBuffer buffer;
        if (IsPHWC4(output->tensor.shape) && external_buffer) {
          buffer = external_buffer->MakeRef();
        } else {
          RETURN_IF_ERROR(CreateReadWriteShaderStorageBuffer<float>(
              GetElementsSizeForPHWC4(output->tensor.shape), &buffer));
        }
        RETURN_IF_ERROR(
            phwc4_objects_.RegisterBuffer(output->id, std::move(buffer)));
      }
    }

    // Create shaders to convert from/to phwc4.
    RETURN_IF_ERROR(ConverterBhwcToPhwc4::Create(&bhwc_to_phwc4_));
    RETURN_IF_ERROR(ConverterPhwc4ToBhwc::Create(&phwc4_to_bhwc_));

    // Compile model.
    CompilationOptions compile_options;
    compile_options.allow_precision_loss =
        static_cast<bool>(options_.compile_options.precision_loss_allowed);
    compile_options.preferred_obj_type = static_cast<ObjectType>(
        options_.compile_options.preferred_gl_object_type);
    compile_options.ref_obj_type = static_cast<ObjectType>(
        options_.compile_options.preferred_gl_object_type);
    compile_options.dynamic_batch =
        static_cast<bool>(options_.compile_options.dynamic_batch_enabled);
    compile_options.inline_parameters =
        static_cast<bool>(options_.compile_options.inline_parameters);
    auto shaders = NewNodeShaderRegistry();
    GpuInfo gpu_info;
    RETURN_IF_ERROR(RequestGpuInfo(&gpu_info));
    command_queue_ = NewCommandQueue(gpu_info);
    auto workgroups_calculator =
        BestEffortWorkgroupsCalculator(options_.metadata, gpu_info);
    std::unique_ptr<CompiledModel> compiled_model;
    RETURN_IF_ERROR(Compile(compile_options, graph, tflite_graph_io, *shaders,
                            *workgroups_calculator, &compiled_model));

    // Create inference context.
    const RuntimeOptions runtime_options;
    RETURN_IF_ERROR(compiled_model->NewRun(runtime_options, &phwc4_objects_,
                                           command_queue_.get(),
                                           &inference_context_));
    return OkStatus();
  }

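  // Runs the compiled GPU program: uploads (or converts) inputs, executes the
  // inference context, and downloads (or converts) outputs. Must be called
  // with the same EGL context that was current during Prepare().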
  Status Invoke(TfLiteContext* context) {
    const EGLContext egl_context_at_delegate_init = env_->context().context();
    const EGLContext egl_context_at_delegate_invoke = eglGetCurrentContext();
    if (egl_context_at_delegate_init != egl_context_at_delegate_invoke) {
      return FailedPreconditionError(
          "Delegate should run on the same thread where it was initialized.");
    }

    // Push input data from a tensor to GPU.
    for (ValueId id : inputs_) {
      const ValueRef& ref = tensors_[id];
      auto external_object = bhwc_objects_.FindBuffer(ref.tensor_index);
      if (external_object) {
        // Use input from GPU.
        // Conversion is needed only when external object is not phwc4.
        if (!IsPHWC4(tensors_[id].shape)) {
          RETURN_IF_ERROR(bhwc_to_phwc4_.Convert(
              ref.shape, *external_object, command_queue_.get(),
              phwc4_objects_.FindBuffer(id)));
        }
      } else {
        // Copy from CPU to GPU.
        TfLiteTensor& tensor = context->tensors[ref.tensor_index];
        RETURN_IF_ERROR(CopyToBufferHandle(id, &tensor));
      }
    }

    // Run inference.
    RETURN_IF_ERROR(inference_context_->Reset());
    RETURN_IF_ERROR(inference_context_->Execute());

    // Push output data from GPU to a tensor.
    bool finished_gpu_processing = false;
    for (ValueId id : outputs_) {
      const ValueRef& ref = tensors_[id];
      auto external_object = bhwc_objects_.FindBuffer(ref.tensor_index);
      if (external_object) {
        // Convert data from PHWC4 to BHWC and leave it in GPU object.
        // Conversion is needed only when external object is not phwc4.
        if (!IsPHWC4(tensors_[id].shape)) {
          RETURN_IF_ERROR(
              phwc4_to_bhwc_.Convert(ref.shape, *phwc4_objects_.FindBuffer(id),
                                     command_queue_.get(), external_object));
        }
      } else {
        // Wait until all GPU commands are completed. Waiting here lowers
        // processing latency, because the buffer reads below will not stall
        // on data that is not yet ready.
        if (!finished_gpu_processing) {
          RETURN_IF_ERROR(command_queue_->WaitForCompletion());
          finished_gpu_processing = true;
        }
        // Copy from GPU to CPU.
        TfLiteTensor& tensor = context->tensors[ref.tensor_index];
        RETURN_IF_ERROR(CopyFromBufferHandle(id, &tensor));
      }
    }
    return OkStatus();
  }

  TfLiteDelegate* tflite_delegate() { return &delegate_; }

 private:
  Status FindObject(ValueId id, ValueRef* ref) const {
    if (id >= tensors_.size()) {
      return InvalidArgumentError("Invalid buffer id");
    }
    *ref = tensors_[id];
    return OkStatus();
  }

  TfLiteDelegate delegate_ = {
      reinterpret_cast<void*>(this),  // .data_
      DelegatePrepare,                // .Prepare
      DelegateCopyFromBufferHandle,   // .CopyFromBufferHandle
      DelegateCopyToBufferHandle,     // .CopyToBufferHandle
      nullptr,                        // .FreeBufferHandle
      kTfLiteDelegateFlagsNone,       // .flags
  };

  TfLiteGpuDelegateOptions options_;

  std::unique_ptr<EglEnvironment> env_;
  std::vector<ValueRef> tensors_;  // indexed by ValueId
  std::vector<ValueId> inputs_;
  std::vector<ValueId> outputs_;
  ObjectManager phwc4_objects_;
  ObjectManager bhwc_objects_;  // key is tensor_index
  ConverterPhwc4ToBhwc phwc4_to_bhwc_;
  ConverterBhwcToPhwc4 bhwc_to_phwc4_;
  std::unique_ptr<CommandQueue> command_queue_;
  std::unique_ptr<InferenceContext> inference_context_;
};

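// The C-style callbacks below bridge TfLiteDelegate/TfLiteRegistration to the
// Delegate instance stored in TfLiteDelegate::data_ (set in the delegate_
// member above) and in TfLiteNode::user_data (set by the .init callback in
// DelegatePrepare).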
inline Delegate* GetGpuDelegate(TfLiteNode* node) {
  return reinterpret_cast<Delegate*>(node->user_data);
}

inline Delegate* GetGpuDelegate(TfLiteDelegate* delegate) {
  return reinterpret_cast<Delegate*>(delegate->data_);
}

TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
  const TfLiteRegistration kRegistration = {
      // .init
      [](TfLiteContext* context, const char* buffer, size_t) -> void* {
        const auto* params =
            reinterpret_cast<const TfLiteDelegateParams*>(buffer);
        auto* gpu_delegate = GetGpuDelegate(params->delegate);
        // Everything below should happen in the prepare function call, but
        // TFLite forbids that.
        const auto status = gpu_delegate->Prepare(context, params);
        if (status.ok()) return gpu_delegate;
        context->ReportError(context, "TfLiteGpuDelegate Prepare: %s",
                             status.error_message().c_str());
        return nullptr;
      },
      // .free
      [](TfLiteContext*, void* buffer) -> void {},
      // .prepare
      [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
        return node->user_data ? kTfLiteOk : kTfLiteError;
      },
      // .invoke
      [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
        const auto status = GetGpuDelegate(node)->Invoke(context);
        if (status.ok()) return kTfLiteOk;
        context->ReportError(context, "TfLiteGpuDelegate Invoke: %s",
                             status.error_message().c_str());
        return kTfLiteError;
      },
      nullptr,              // .profiling_string
      0,                    // .builtin_code
      "TfLiteGpuDelegate",  // .custom_name
      1,                    // .version
  };
  TfLiteIntArray* ops_to_replace = GetOpsToReplace(context);
  const auto status = context->ReplaceNodeSubsetsWithDelegateKernels(
      context, kRegistration, ops_to_replace, delegate);
  TfLiteIntArrayFree(ops_to_replace);
  return status;
}

TfLiteStatus DelegateCopyFromBufferHandle(TfLiteContext* context,
                                          TfLiteDelegate* delegate,
                                          TfLiteBufferHandle buffer_handle,
                                          TfLiteTensor* tensor) {
  auto* gpu_delegate = GetGpuDelegate(delegate);
  if (!gpu_delegate) return kTfLiteError;
  const auto status = gpu_delegate->CopyFromBufferHandle(buffer_handle, tensor);
  if (status.ok()) return kTfLiteOk;
  context->ReportError(context, "TfLiteGpuDelegate CopyFromBufferHandle: %s",
                       status.error_message().c_str());
  return kTfLiteError;
}

TfLiteStatus DelegateCopyToBufferHandle(TfLiteContext* context,
                                        TfLiteDelegate* delegate,
                                        TfLiteBufferHandle buffer_handle,
                                        TfLiteTensor* tensor) {
  auto* gpu_delegate = GetGpuDelegate(delegate);
  if (!gpu_delegate) return kTfLiteError;
  const auto status = gpu_delegate->CopyToBufferHandle(buffer_handle, tensor);
  if (status.ok()) return kTfLiteOk;
  context->ReportError(context, "TfLiteGpuDelegate CopyToBufferHandle: %s",
                       status.error_message().c_str());
  return kTfLiteError;
}

}  // namespace
}  // namespace gl
}  // namespace gpu
}  // namespace tflite

TfLiteGlCompileOptions TfLiteGlCompileOptionsDefault() {
  TfLiteGlCompileOptions options;
  options.precision_loss_allowed = 0;
  options.preferred_gl_object_type = TFLITE_GL_OBJECT_TYPE_FASTEST;
  options.dynamic_batch_enabled = 0;
  options.inline_parameters = 0;
  return options;
}

TfLiteGpuDelegateOptions TfLiteGpuDelegateOptionsDefault() {
  TfLiteGpuDelegateOptions options;
  options.metadata = nullptr;
  options.compile_options = TfLiteGlCompileOptionsDefault();
  return options;
}

TfLiteDelegate* TfLiteGpuDelegateCreate(
    const TfLiteGpuDelegateOptions* options) {
  TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
                       "Created TensorFlow Lite delegate for GPU.");
  auto* gpu_delegate = new tflite::gpu::gl::Delegate(options);
  return gpu_delegate ? gpu_delegate->tflite_delegate() : nullptr;
}

void TfLiteGpuDelegateDelete(TfLiteDelegate* delegate) {
  delete tflite::gpu::gl::GetGpuDelegate(delegate);
}

TfLiteStatus TfLiteGpuDelegateBindBufferToTensor(TfLiteDelegate* delegate,
                                                 GLuint buffer,
                                                 int tensor_index) {
  auto* gpu_delegate = tflite::gpu::gl::GetGpuDelegate(delegate);
  return gpu_delegate &&
                 gpu_delegate->BindBufferToTensor(buffer, tensor_index).ok()
             ? kTfLiteOk
             : kTfLiteError;
}

#ifndef TFLITE_GPU_BINARY_RELEASE
const uint8_t* TfLiteGpuDelegateGetModelMetadata(const void* tflite_model) {
  const auto* model = reinterpret_cast<const tflite::Model*>(tflite_model);
  if (!model || !model->metadata_buffer() || !model->buffers()) return nullptr;
  for (int32_t buffer_index : *model->metadata_buffer()) {
    if (buffer_index < 0 || buffer_index >= model->buffers()->size()) continue;
    const tflite::Buffer* buffer = model->buffers()->Get(buffer_index);
    if (!buffer) continue;
    const uint8_t* data = buffer->data()->data();
    if (!flatbuffers::BufferHasIdentifier(
            data, tflite::gpu::gl::data::FlowMetadataIdentifier())) {
      continue;
    }
    flatbuffers::Verifier verifier(data, buffer->data()->size());
    return tflite::gpu::gl::data::VerifyFlowMetadataBuffer(verifier) ? data
                                                                     : nullptr;
  }
  return nullptr;
}
#endif  // TFLITE_GPU_BINARY_RELEASE
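
// Typical usage of the public API above (illustrative sketch only; assumes an
// already-built tflite::Interpreter named `interpreter`, plus `ssbo` and
// `tensor_index` supplied by the caller):
//
//   TfLiteGpuDelegateOptions options = TfLiteGpuDelegateOptionsDefault();
//   options.compile_options.precision_loss_allowed = 1;  // optional
//   TfLiteDelegate* delegate = TfLiteGpuDelegateCreate(&options);
//
//   // Optional: let the delegate read/write a user-provided SSBO directly.
//   // Binding before ModifyGraphWithDelegate lets Prepare() reuse the buffer
//   // when its shape is already PHWC4-compatible.
//   TfLiteGpuDelegateBindBufferToTensor(delegate, ssbo, tensor_index);
//
//   if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) {
//     // Fall back to CPU execution.
//   }
//   // Invoke must run with the same EGL context that was current when the
//   // delegate was prepared (see the check at the top of Delegate::Invoke).
//   interpreter->Invoke();
//
//   TfLiteGpuDelegateDelete(delegate);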