/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl_delegate.h"

#include <EGL/egl.h>
#include <GLES3/gl31.h>

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <string>
#include <unordered_set>
#include <vector>

#include "absl/types/span.h"
#include "tensorflow/lite/builtin_ops.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/delegates/gpu/common/convert.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_builder.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/model_transformations.h"
#include "tensorflow/lite/delegates/gpu/gl/api.h"
#include "tensorflow/lite/delegates/gpu/gl/command_queue.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler.h"
#include "tensorflow/lite/delegates/gpu/gl/converters/bhwc_to_phwc4.h"
#include "tensorflow/lite/delegates/gpu/gl/converters/phwc4_to_bhwc.h"
#include "tensorflow/lite/delegates/gpu/gl/egl_environment.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_call.h"
#include "tensorflow/lite/delegates/gpu/gl/kernels/registry.h"
#include "tensorflow/lite/delegates/gpu/gl/request_gpu_info.h"
#include "tensorflow/lite/delegates/gpu/gl/workgroups/best_effort_calculator.h"
#include "tensorflow/lite/minimal_logging.h"

#ifndef TFLITE_GPU_BINARY_RELEASE
#include "flatbuffers/flatbuffers.h"  // from @flatbuffers
#include "tensorflow/lite/delegates/gpu/gl/metadata_generated.h"
#include "tensorflow/lite/schema/schema_generated.h"
#endif  // TFLITE_GPU_BINARY_RELEASE
namespace tflite {
namespace gpu {
namespace gl {
namespace {

// Forward declarations.
TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate);
TfLiteStatus DelegateCopyFromBufferHandle(
    TfLiteContext* context, TfLiteDelegate* delegate,
    TfLiteBufferHandle buffer_handle,  // ValueId
    TfLiteTensor* tensor);
TfLiteStatus DelegateCopyToBufferHandle(
    TfLiteContext* context, TfLiteDelegate* delegate,
    TfLiteBufferHandle buffer_handle,  // ValueId
    TfLiteTensor* tensor);

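// Returns true when a BHWC tensor is already bit-compatible with its PHWC4
// (4-channel-slice packed) representation, so the conversion pass can be
// skipped: either the channel count is exactly 4, or the tensor is 1x1
// spatially and its channel count is divisible by 4.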
inline bool IsPHWC4(const BHWC& shape) {
  return shape.c == 4 || (shape.h == 1 && shape.w == 1 && shape.c % 4 == 0);
}

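// State shared behind the TfLiteDelegate handle. Owns the EGL environment,
// the compiled GPU inference context, and the PHWC4 staging buffers that back
// the delegated partition's input and output tensors.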
class Delegate {
  struct ValueRef {
    BHWC shape;
    int tensor_index;
  };

 public:
  explicit Delegate(const TfLiteGpuDelegateOptions* options) {
    if (options) {
      options_ = *options;
    } else {
      // Default options.
      options_.metadata = nullptr;
      options_.compile_options.precision_loss_allowed = 0;
      options_.compile_options.preferred_gl_object_type =
          TFLITE_GL_OBJECT_TYPE_FASTEST;
      options_.compile_options.dynamic_batch_enabled = 0;
      // inline_parameters is read in Prepare() and must not be left
      // uninitialized.
      options_.compile_options.inline_parameters = 0;
    }
  }

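  // Downloads the PHWC4 buffer registered under `handle` and converts it into
  // the BHWC float layout expected by `tensor`. CopyToBufferHandle below
  // performs the reverse conversion and upload.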
  absl::Status CopyFromBufferHandle(TfLiteBufferHandle handle,
                                    TfLiteTensor* tensor) {
    ValueRef ref;
    RETURN_IF_ERROR(FindObject(handle, &ref));
    auto buffer = phwc4_objects_.FindBuffer(handle);
    return buffer->MappedRead<float>([&](absl::Span<const float> data) {
      tensor->data_is_stale = false;
      return ConvertFromPHWC4(
          data, ref.shape,
          absl::MakeSpan(tensor->data.f, tensor->bytes / sizeof(float)));
    });
  }

  absl::Status CopyToBufferHandle(TfLiteBufferHandle handle,
                                  TfLiteTensor* tensor) const {
    ValueRef ref;
    RETURN_IF_ERROR(FindObject(handle, &ref));
    auto buffer = phwc4_objects_.FindBuffer(handle);
    return buffer->MappedWrite<float>([&](absl::Span<float> data) {
      return ConvertToPHWC4(
          absl::MakeConstSpan(tensor->data.f, tensor->bytes / sizeof(float)),
          ref.shape, data);
    });
  }

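  // Registers a caller-owned SSBO as the BHWC object for `tensor_index`, so
  // that tensor I/O can stay on the GPU. The buffer is wrapped without taking
  // ownership. Binding has to happen before Prepare() runs, i.e. before the
  // delegate is attached to the interpreter.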
  absl::Status BindBufferToTensor(GLuint ssbo, int tensor_index) {
    int64_t bytes_size;
    RETURN_IF_ERROR(GetSSBOSize(ssbo, &bytes_size));
    return bhwc_objects_.RegisterBuffer(
        tensor_index, GlBuffer(GL_SHADER_STORAGE_BUFFER, ssbo, bytes_size,
                               /* offset = */ 0,
                               /* has_ownership = */ false));
  }

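  // Builds the GPU program for the delegated partition: converts the TFLite
  // subgraph into a GraphFloat32, allocates PHWC4 staging buffers for the
  // partition's inputs and outputs, and compiles the generated shaders.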
  absl::Status Prepare(TfLiteContext* context,
                       const TfLiteDelegateParams* delegate_params) {
    // Extract TFLite delegate execution plan from the context and convert it
    // into GraphFloat32.
    GraphFloat32 graph;
    RETURN_IF_ERROR(BuildModel(context, delegate_params, &graph));

    // Apply general transformations on the graph.
    ModelTransformer transformer(&graph);
    if (!ApplyModelTransformations(&transformer)) {
      return absl::InternalError("Graph transformations failed");
    }

    if (!env_) RETURN_IF_ERROR(EglEnvironment::NewEglEnvironment(&env_));

    // TODO(impjdi): Remove code duplication.
    auto values = graph.values();
    auto find_value = [&](int tensor_index) -> Value* {
      for (auto value : values) {
        if (value->tensor.ref == tensor_index) return value;
      }
      return nullptr;
    };
    tensors_.reserve(values.back()->id + 1);
    for (auto value : values) {
      if (tensors_.size() <= value->id) {
        tensors_.resize(value->id + 1);
      }
      tensors_[value->id] = {value->tensor.shape, 0};
    }

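    // Tensor indices that are inputs or outputs of this delegated partition
    // in the original TFLite graph; handed to Compile() below so the compiler
    // knows which values are externally visible.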
    std::unordered_set<int> tflite_graph_io;  // NOLINT

    // Prepare graph inputs.
    //
    // Note that graph.inputs() cannot be used directly, as the notion of
    // graph input has a different meaning in the public API and in the
    // GPU-internal API.
    {
      inputs_.clear();
      inputs_.reserve(delegate_params->input_tensors->size);
      for (int i = 0; i < delegate_params->input_tensors->size; ++i) {
        const int tensor_index = delegate_params->input_tensors->data[i];
        auto* tensor = context->tensors + tensor_index;
        if (tensor->allocation_type == TfLiteAllocationType::kTfLiteMmapRo) {
          continue;
        }
        tflite_graph_io.insert(tensor_index);
        const auto* input = find_value(tensor_index);
        if (!input || tensor->type != TfLiteType::kTfLiteFloat32) {
          return absl::NotFoundError("Input tensor is not found in the graph.");
        }

        inputs_.push_back(input->id);
        tensor->buffer_handle = input->id;
        tensor->delegate = &delegate_;
        tensors_[input->id].tensor_index = tensor_index;

        // Create the PHWC4 input buffer. If an externally provided object is
        // already in PHWC4 layout, the conversion step can be skipped. The
        // external buffer is still kept in bhwc_objects_ to indicate that an
        // externally provided buffer exists for this tensor.
        auto external_buffer = bhwc_objects_.FindBuffer(tensor_index);
        GlBuffer buffer;
        if (IsPHWC4(input->tensor.shape) && external_buffer) {
          buffer = external_buffer->MakeRef();
        } else {
          RETURN_IF_ERROR(CreateReadWriteShaderStorageBuffer<float>(
              GetElementsSizeForPHWC4(input->tensor.shape), &buffer));
        }
        RETURN_IF_ERROR(
            phwc4_objects_.RegisterBuffer(input->id, std::move(buffer)));
      }
    }

    // Prepare graph outputs.
    //
    // Note that graph.outputs() cannot be used directly, as the notion of
    // graph output has a different meaning in the public API and in the
    // GPU-internal API.
    {
      outputs_.clear();
      outputs_.reserve(delegate_params->output_tensors->size);
      for (int i = 0; i < delegate_params->output_tensors->size; ++i) {
        const int tensor_index = delegate_params->output_tensors->data[i];
        auto* tensor = context->tensors + tensor_index;
        tflite_graph_io.insert(tensor_index);
        const auto* output = find_value(tensor_index);
        if (!output || tensor->type != TfLiteType::kTfLiteFloat32) {
          return absl::NotFoundError(
              "Output tensor is not found in the graph.");
        }

        outputs_.push_back(output->id);
        tensor->buffer_handle = output->id;
        tensor->delegate = &delegate_;
        tensors_[output->id].tensor_index = tensor_index;

        // Create the PHWC4 output buffer. If an externally provided object is
        // already in PHWC4 layout, the conversion step can be skipped.
        auto external_buffer = bhwc_objects_.FindBuffer(tensor_index);
        GlBuffer buffer;
        if (IsPHWC4(output->tensor.shape) && external_buffer) {
          buffer = external_buffer->MakeRef();
        } else {
          RETURN_IF_ERROR(CreateReadWriteShaderStorageBuffer<float>(
              GetElementsSizeForPHWC4(output->tensor.shape), &buffer));
        }
        RETURN_IF_ERROR(
            phwc4_objects_.RegisterBuffer(output->id, std::move(buffer)));
      }
    }

    // Create shaders to convert from/to PHWC4.
    RETURN_IF_ERROR(ConverterBhwcToPhwc4::Create(&bhwc_to_phwc4_));
    RETURN_IF_ERROR(ConverterPhwc4ToBhwc::Create(&phwc4_to_bhwc_));

    // Compile model.
    CompilationOptions compile_options;
    compile_options.allow_precision_loss =
        static_cast<bool>(options_.compile_options.precision_loss_allowed);
    compile_options.preferred_obj_type = static_cast<ObjectType>(
        options_.compile_options.preferred_gl_object_type);
    compile_options.ref_obj_type = static_cast<ObjectType>(
        options_.compile_options.preferred_gl_object_type);
    compile_options.dynamic_batch =
        static_cast<bool>(options_.compile_options.dynamic_batch_enabled);
    compile_options.inline_parameters =
        static_cast<bool>(options_.compile_options.inline_parameters);
    auto shaders = NewNodeShaderRegistry();
    GpuInfo gpu_info;
    RETURN_IF_ERROR(RequestGpuInfo(&gpu_info));
    command_queue_ = NewCommandQueue(gpu_info);
    auto workgroups_calculator =
        BestEffortWorkgroupsCalculator(options_.metadata, gpu_info);
    std::unique_ptr<CompiledModel> compiled_model;
    RETURN_IF_ERROR(Compile(compile_options, graph, tflite_graph_io, *shaders,
                            *workgroups_calculator, &compiled_model));

    // Create inference context.
    const RuntimeOptions runtime_options;
    RETURN_IF_ERROR(compiled_model->NewRun(runtime_options, &phwc4_objects_,
                                           command_queue_.get(),
                                           &inference_context_));
    return absl::OkStatus();
  }

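  // Runs one inference pass: uploads (and, when needed, BHWC->PHWC4 converts)
  // the inputs, executes the compiled program, and converts/downloads the
  // outputs. Must run on the thread that initialized the delegate, since the
  // EGL context is bound to that thread.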
  absl::Status Invoke(TfLiteContext* context) {
    const EGLContext egl_context_at_delegate_init = env_->context().context();
    const EGLContext egl_context_at_delegate_invoke = eglGetCurrentContext();
    if (egl_context_at_delegate_init != egl_context_at_delegate_invoke) {
      return absl::FailedPreconditionError(
          "Delegate should run on the same thread where it was initialized.");
    }

    // Push input data from the tensors to the GPU.
    for (ValueId id : inputs_) {
      const ValueRef& ref = tensors_[id];
      auto external_object = bhwc_objects_.FindBuffer(ref.tensor_index);
      if (external_object) {
        // Use the input from the GPU. Conversion is needed only when the
        // external object is not already in PHWC4 layout.
        if (!IsPHWC4(ref.shape)) {
          RETURN_IF_ERROR(bhwc_to_phwc4_.Convert(
              ref.shape, *external_object, command_queue_.get(),
              phwc4_objects_.FindBuffer(id)));
        }
      } else {
        // Copy from CPU to GPU.
        TfLiteTensor& tensor = context->tensors[ref.tensor_index];
        RETURN_IF_ERROR(CopyToBufferHandle(id, &tensor));
      }
    }

    // Run inference.
    RETURN_IF_ERROR(inference_context_->Reset());
    RETURN_IF_ERROR(inference_context_->Execute());

    // Push output data from the GPU to the tensors.
    bool finished_gpu_processing = false;
    for (ValueId id : outputs_) {
      const ValueRef& ref = tensors_[id];
      auto external_object = bhwc_objects_.FindBuffer(ref.tensor_index);
      if (external_object) {
        // Convert data from PHWC4 to BHWC and leave it in the GPU object.
        // Conversion is needed only when the external object is not already
        // in PHWC4 layout.
        if (!IsPHWC4(ref.shape)) {
          RETURN_IF_ERROR(
              phwc4_to_bhwc_.Convert(ref.shape, *phwc4_objects_.FindBuffer(id),
                                     command_queue_.get(), external_object));
        }
      } else {
        // Wait until all GPU commands are completed. Waiting once before the
        // first CPU read lowers overall latency, as the buffer reads below
        // will then not stall on data that is not yet ready.
        if (!finished_gpu_processing) {
          RETURN_IF_ERROR(command_queue_->WaitForCompletion());
          finished_gpu_processing = true;
        }
        // Copy from GPU to CPU.
        TfLiteTensor& tensor = context->tensors[ref.tensor_index];
        RETURN_IF_ERROR(CopyFromBufferHandle(id, &tensor));
      }
    }
    return absl::OkStatus();
  }

  TfLiteDelegate* tflite_delegate() { return &delegate_; }

 private:
  absl::Status FindObject(ValueId id, ValueRef* ref) const {
    if (id >= tensors_.size()) {
      return absl::InvalidArgumentError("Invalid buffer id");
    }
    *ref = tensors_[id];
    return absl::OkStatus();
  }

  TfLiteDelegate delegate_ = {
      reinterpret_cast<void*>(this),  // .data_
      DelegatePrepare,                // .Prepare
      DelegateCopyFromBufferHandle,   // .CopyFromBufferHandle
      DelegateCopyToBufferHandle,     // .CopyToBufferHandle
      nullptr,                        // .FreeBufferHandle
      kTfLiteDelegateFlagsNone,       // .flags
  };

  TfLiteGpuDelegateOptions options_;

  std::unique_ptr<EglEnvironment> env_;
  std::vector<ValueRef> tensors_;  // indexed by ValueId
  std::vector<ValueId> inputs_;
  std::vector<ValueId> outputs_;
  ObjectManager phwc4_objects_;
  ObjectManager bhwc_objects_;  // key is tensor_index
  ConverterPhwc4ToBhwc phwc4_to_bhwc_;
  ConverterBhwcToPhwc4 bhwc_to_phwc4_;
  std::unique_ptr<CommandQueue> command_queue_;
  std::unique_ptr<InferenceContext> inference_context_;
};

inline Delegate* GetGpuDelegate(TfLiteNode* node) {
  return reinterpret_cast<Delegate*>(node->user_data);
}

inline Delegate* GetGpuDelegate(TfLiteDelegate* delegate) {
  return reinterpret_cast<Delegate*>(delegate->data_);
}

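// Called by TFLite when the delegate is attached: replaces every supported
// node subset with a delegate kernel whose user_data points at the Delegate
// instance that prepared it.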
TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
  const TfLiteRegistration kRegistration = {
      // .init
      [](TfLiteContext* context, const char* buffer, size_t) -> void* {
        const auto* params =
            reinterpret_cast<const TfLiteDelegateParams*>(buffer);
        auto* gpu_delegate = GetGpuDelegate(params->delegate);
        // Everything below should happen in the .prepare function call, but
        // TFLite's API does not allow that.
        const auto status = gpu_delegate->Prepare(context, params);
        if (status.ok()) return gpu_delegate;
        TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate Prepare: %s",
                           std::string(status.message()).c_str());
        return nullptr;
      },
      // .free
      [](TfLiteContext*, void* buffer) -> void {},
      // .prepare
      [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
        return node->user_data ? kTfLiteOk : kTfLiteError;
      },
      // .invoke
      [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
        const auto status = GetGpuDelegate(node)->Invoke(context);
        if (status.ok()) return kTfLiteOk;
        TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate Invoke: %s",
                           std::string(status.message()).c_str());
        return kTfLiteError;
      },
      nullptr,              // .profiling_string
      0,                    // .builtin_code
      "TfLiteGpuDelegate",  // .custom_name
      1,                    // .version
  };
  TfLiteIntArray* ops_to_replace = GetOpsToReplace(context);
  const auto status = context->ReplaceNodeSubsetsWithDelegateKernels(
      context, kRegistration, ops_to_replace, delegate);
  TfLiteIntArrayFree(ops_to_replace);
  return status;
}

TfLiteStatus DelegateCopyFromBufferHandle(TfLiteContext* context,
                                          TfLiteDelegate* delegate,
                                          TfLiteBufferHandle buffer_handle,
                                          TfLiteTensor* tensor) {
  auto* gpu_delegate = GetGpuDelegate(delegate);
  if (!gpu_delegate) return kTfLiteError;
  const auto status = gpu_delegate->CopyFromBufferHandle(buffer_handle, tensor);
  if (status.ok()) return kTfLiteOk;
  TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate CopyFromBufferHandle: %s",
                     std::string(status.message()).c_str());
  return kTfLiteError;
}

TfLiteStatus DelegateCopyToBufferHandle(TfLiteContext* context,
                                        TfLiteDelegate* delegate,
                                        TfLiteBufferHandle buffer_handle,
                                        TfLiteTensor* tensor) {
  auto* gpu_delegate = GetGpuDelegate(delegate);
  if (!gpu_delegate) return kTfLiteError;
  const auto status = gpu_delegate->CopyToBufferHandle(buffer_handle, tensor);
  if (status.ok()) return kTfLiteOk;
  TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate CopyToBufferHandle: %s",
                     std::string(status.message()).c_str());
  return kTfLiteError;
}

}  // namespace
}  // namespace gl
}  // namespace gpu
}  // namespace tflite

TfLiteGlCompileOptions TfLiteGlCompileOptionsDefault() {
  TfLiteGlCompileOptions options;
  options.precision_loss_allowed = 0;
  options.preferred_gl_object_type = TFLITE_GL_OBJECT_TYPE_FASTEST;
  options.dynamic_batch_enabled = 0;
  options.inline_parameters = 0;
  return options;
}

TfLiteGpuDelegateOptions TfLiteGpuDelegateOptionsDefault() {
  TfLiteGpuDelegateOptions options;
  options.metadata = nullptr;
  options.compile_options = TfLiteGlCompileOptionsDefault();
  return options;
}

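// Typical usage, as a minimal sketch (interpreter construction and error
// handling are omitted; `interpreter` is assumed to be a built
// tflite::Interpreter):
//
//   TfLiteDelegate* delegate = TfLiteGpuDelegateCreate(/*options=*/nullptr);
//   if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) {
//     // Fall back to CPU execution.
//   }
//   // ... interpreter->Invoke(), read outputs ...
//   TfLiteGpuDelegateDelete(delegate);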
TfLiteDelegate* TfLiteGpuDelegateCreate(
    const TfLiteGpuDelegateOptions* options) {
  TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
                       "Created TensorFlow Lite delegate for GPU.");
  auto* gpu_delegate = new tflite::gpu::gl::Delegate(options);
  return gpu_delegate ? gpu_delegate->tflite_delegate() : nullptr;
}

void TfLiteGpuDelegateDelete(TfLiteDelegate* delegate) {
  delete tflite::gpu::gl::GetGpuDelegate(delegate);
}

TfLiteStatus TfLiteGpuDelegateBindBufferToTensor(TfLiteDelegate* delegate,
                                                 GLuint buffer,
                                                 int tensor_index) {
  auto* gpu_delegate = tflite::gpu::gl::GetGpuDelegate(delegate);
  return gpu_delegate &&
                 gpu_delegate->BindBufferToTensor(buffer, tensor_index).ok()
             ? kTfLiteOk
             : kTfLiteError;
}

#ifndef TFLITE_GPU_BINARY_RELEASE
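// Scans the model's metadata buffers for one that carries the FlowMetadata
// flatbuffer identifier and returns a pointer to the verified flatbuffer, or
// nullptr if no valid metadata is present.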
const uint8_t* TfLiteGpuDelegateGetModelMetadata(const void* tflite_model) {
  const auto* model = reinterpret_cast<const tflite::Model*>(tflite_model);
  if (!model || !model->metadata_buffer() || !model->buffers()) return nullptr;
  for (int32_t buffer_index : *model->metadata_buffer()) {
    if (buffer_index < 0 || buffer_index >= model->buffers()->size()) continue;
    const tflite::Buffer* buffer = model->buffers()->Get(buffer_index);
    if (!buffer || !buffer->data()) continue;
    const uint8_t* data = buffer->data()->data();
    if (!flatbuffers::BufferHasIdentifier(
            data, tflite::gpu::gl::data::FlowMetadataIdentifier())) {
      continue;
    }
    flatbuffers::Verifier verifier(data, buffer->data()->size());
    return tflite::gpu::gl::data::VerifyFlowMetadataBuffer(verifier) ? data
                                                                     : nullptr;
  }
  return nullptr;
}
#endif  // TFLITE_GPU_BINARY_RELEASE