/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl_delegate.h"

#include <EGL/egl.h>
#include <GLES3/gl31.h>

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

#include "absl/types/span.h"
#include "tensorflow/lite/builtin_ops.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/delegates/gpu/common/convert.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_builder.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/model_transformations.h"
#include "tensorflow/lite/delegates/gpu/gl/api.h"
#include "tensorflow/lite/delegates/gpu/gl/command_queue.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler.h"
#include "tensorflow/lite/delegates/gpu/gl/converters/bhwc_to_phwc4.h"
#include "tensorflow/lite/delegates/gpu/gl/converters/phwc4_to_bhwc.h"
#include "tensorflow/lite/delegates/gpu/gl/egl_environment.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_call.h"
#include "tensorflow/lite/delegates/gpu/gl/kernels/registry.h"
#include "tensorflow/lite/delegates/gpu/gl/request_gpu_info.h"
#include "tensorflow/lite/delegates/gpu/gl/workgroups/best_effort_calculator.h"
#include "tensorflow/lite/minimal_logging.h"

#ifndef TFLITE_GPU_BINARY_RELEASE
#include "flatbuffers/flatbuffers.h"  // from @flatbuffers
#include "tensorflow/lite/delegates/gpu/gl/metadata_generated.h"
#include "tensorflow/lite/schema/schema_generated.h"
#endif  // TFLITE_GPU_BINARY_RELEASE

namespace tflite {
namespace gpu {
namespace gl {
namespace {

// Forward declarations.
TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate);
TfLiteStatus DelegateCopyFromBufferHandle(
    TfLiteContext* context, TfLiteDelegate* delegate,
    TfLiteBufferHandle buffer_handle,  // ValueId
    TfLiteTensor* tensor);
TfLiteStatus DelegateCopyToBufferHandle(
    TfLiteContext* context, TfLiteDelegate* delegate,
    TfLiteBufferHandle buffer_handle,  // ValueId
    TfLiteTensor* tensor);

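// PHWC4 packs channels into slices of four floats (one RGBA texel per pixel
// per slice). A BHWC tensor already matches that memory layout when no
// per-pixel channel padding is needed: either C is exactly 4, or the tensor
// is 1x1 spatially and C is a multiple of 4. In those cases the layout
// conversion shaders can be skipped.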
inline bool IsPHWC4(const BHWC& shape) {
  return shape.c == 4 || (shape.h == 1 && shape.w == 1 && shape.c % 4 == 0);
}

class Delegate {
  struct ValueRef {
    BHWC shape;
    int tensor_index;
  };

 public:
  explicit Delegate(const TfLiteGpuDelegateOptions* options) {
    if (options) {
      options_ = *options;
    } else {
      // Default options.
      options_.metadata = nullptr;
      options_.compile_options.precision_loss_allowed = 0;
      options_.compile_options.preferred_gl_object_type =
          TFLITE_GL_OBJECT_TYPE_FASTEST;
      options_.compile_options.dynamic_batch_enabled = 0;
    }
  }

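  // Reads the delegate-owned PHWC4 buffer identified by `handle` and converts
  // its contents back into the dense BHWC layout of the CPU tensor.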
  absl::Status CopyFromBufferHandle(TfLiteBufferHandle handle,
                                    TfLiteTensor* tensor) {
    ValueRef ref;
    RETURN_IF_ERROR(FindObject(handle, &ref));
    auto buffer = phwc4_objects_.FindBuffer(handle);
    return buffer->MappedRead<float>([&](absl::Span<const float> data) {
      tensor->data_is_stale = false;
      return ConvertFromPHWC4(
          data, ref.shape,
          absl::MakeSpan(tensor->data.f, tensor->bytes / sizeof(float)));
    });
  }

  absl::Status CopyToBufferHandle(TfLiteBufferHandle handle,
                                  TfLiteTensor* tensor) const {
    ValueRef ref;
    RETURN_IF_ERROR(FindObject(handle, &ref));
    auto buffer = phwc4_objects_.FindBuffer(handle);
    return buffer->MappedWrite<float>([&](absl::Span<float> data) {
      return ConvertToPHWC4(
          absl::MakeConstSpan(tensor->data.f, tensor->bytes / sizeof(float)),
          ref.shape, data);
    });
  }

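  // Wraps a caller-owned SSBO as the backing store for `tensor_index`. The
  // buffer is registered without taking ownership, so the caller must keep it
  // alive while the delegate is in use. Exposed publicly through
  // TfLiteGpuDelegateBindBufferToTensor below.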
  absl::Status BindBufferToTensor(GLuint ssbo, int tensor_index) {
    int64_t bytes_size;
    RETURN_IF_ERROR(GetSSBOSize(ssbo, &bytes_size));
    return bhwc_objects_.RegisterBuffer(
        tensor_index, GlBuffer(GL_SHADER_STORAGE_BUFFER, ssbo, bytes_size,
                               /* offset = */ 0,
                               /* has_ownership = */ false));
  }

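  // Builds the GPU program for the delegated subgraph: converts the TFLite
  // execution plan into a GraphFloat32, applies graph-level transformations,
  // allocates PHWC4 buffers for the subgraph inputs and outputs, compiles the
  // node shaders, and creates the inference context.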
  absl::Status Prepare(TfLiteContext* context,
                       const TfLiteDelegateParams* delegate_params) {
    // Extract TFLite delegate execution plan from the context and convert it
    // into GraphFloat32.
    GraphFloat32 graph;
    RETURN_IF_ERROR(BuildModel(context, delegate_params, &graph));

    // Apply general transformations on the graph.
    NullTransformationReporter reporter;
    ModelTransformer transformer(&graph, &reporter);
    if (!ApplyModelTransformations(&transformer)) {
      return absl::InternalError("Graph transformations failed");
    }

    if (!env_) RETURN_IF_ERROR(EglEnvironment::NewEglEnvironment(&env_));

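    // Build a ValueId -> {shape, tensor_index} table for every value in the
    // graph. find_value does a linear scan to map a TFLite tensor index back
    // to the graph value that references it.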
    // TODO(impjdi): Remove code duplication.
    auto values = graph.values();
    auto find_value = [&](int tensor_index) -> Value* {
      for (auto value : values) {
        if (value->tensor.ref == tensor_index) return value;
      }
      return nullptr;
    };
    tensors_.reserve(values.back()->id + 1);
    for (auto value : values) {
      if (tensors_.size() <= value->id) {
        tensors_.resize(value->id + 1);
      }
      tensors_[value->id] = {value->tensor.shape, 0};
    }

    std::unordered_set<int> tflite_graph_io;  // NOLINT
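    // tflite_graph_io collects the tensor indices of the delegated subgraph's
    // inputs and outputs, i.e. the tensors that cross the CPU/GPU boundary;
    // it is handed to Compile() below.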

    // Prepare graph inputs.
    //
    // Note that graph.inputs() cannot be used directly, as the notion of
    // graph input has a different meaning in public API and GPU-internal API.
    {
      inputs_.clear();
      inputs_.reserve(delegate_params->input_tensors->size);
      for (int i = 0; i < delegate_params->input_tensors->size; ++i) {
        const int tensor_index = delegate_params->input_tensors->data[i];
        auto* tensor = context->tensors + tensor_index;
        if (tensor->allocation_type == TfLiteAllocationType::kTfLiteMmapRo) {
          continue;
        }
        tflite_graph_io.insert(tensor_index);
        const auto* input = find_value(tensor_index);
        if (!input || tensor->type != TfLiteType::kTfLiteFloat32) {
          return absl::NotFoundError("Input tensor is not found in the graph.");
        }

        inputs_.push_back(input->id);
        tensor->buffer_handle = input->id;
        tensor->delegate = &delegate_;
        tensors_[input->id].tensor_index = tensor_index;

        // Create phwc4 input buffer.
        // Check whether an externally provided object is already in PHWC4;
        // if so, the conversion step can be skipped. The same buffer is kept
        // in bhwc_objects_ to indicate that an external buffer was provided.
        auto external_buffer = bhwc_objects_.FindBuffer(tensor_index);
        GlBuffer buffer;
        if (IsPHWC4(input->tensor.shape) && external_buffer) {
          buffer = external_buffer->MakeRef();
        } else {
          RETURN_IF_ERROR(CreateReadWriteShaderStorageBuffer<float>(
              GetElementsSizeForPHWC4(input->tensor.shape), &buffer));
        }
        RETURN_IF_ERROR(
            phwc4_objects_.RegisterBuffer(input->id, std::move(buffer)));
      }
    }

    // Prepare graph outputs.
    //
    // Note that graph.outputs() cannot be used directly, as the notion of
    // graph output has a different meaning in public API and GPU-internal API.
    {
      outputs_.clear();
      outputs_.reserve(delegate_params->output_tensors->size);
      for (int i = 0; i < delegate_params->output_tensors->size; ++i) {
        const int tensor_index = delegate_params->output_tensors->data[i];
        auto* tensor = context->tensors + tensor_index;
        tflite_graph_io.insert(tensor_index);
        const auto* output = find_value(tensor_index);
        if (!output || tensor->type != TfLiteType::kTfLiteFloat32) {
          return absl::NotFoundError(
              "Output tensor is not found in the graph.");
        }

        outputs_.push_back(output->id);
        tensor->buffer_handle = output->id;
        tensor->delegate = &delegate_;
        tensors_[output->id].tensor_index = tensor_index;

        // Create phwc4 output buffer.
        // Check whether an externally provided object is already in PHWC4;
        // if so, the conversion step can be skipped.
        auto external_buffer = bhwc_objects_.FindBuffer(tensor_index);
        GlBuffer buffer;
        if (IsPHWC4(output->tensor.shape) && external_buffer) {
          buffer = external_buffer->MakeRef();
        } else {
          RETURN_IF_ERROR(CreateReadWriteShaderStorageBuffer<float>(
              GetElementsSizeForPHWC4(output->tensor.shape), &buffer));
        }
        RETURN_IF_ERROR(
            phwc4_objects_.RegisterBuffer(output->id, std::move(buffer)));
      }
    }

    // Create shaders to convert from/to phwc4.
    RETURN_IF_ERROR(ConverterBhwcToPhwc4::Create(&bhwc_to_phwc4_));
    RETURN_IF_ERROR(ConverterPhwc4ToBhwc::Create(&phwc4_to_bhwc_));

    // Compile model.
    CompilationOptions compile_options;
    compile_options.allow_precision_loss =
        static_cast<bool>(options_.compile_options.precision_loss_allowed);
    compile_options.preferred_obj_type = static_cast<ObjectType>(
        options_.compile_options.preferred_gl_object_type);
    compile_options.ref_obj_type = static_cast<ObjectType>(
        options_.compile_options.preferred_gl_object_type);
    compile_options.dynamic_batch =
        static_cast<bool>(options_.compile_options.dynamic_batch_enabled);
    compile_options.inline_parameters =
        static_cast<bool>(options_.compile_options.inline_parameters);
    auto shaders = NewNodeShaderRegistry();
    GpuInfo gpu_info;
    RETURN_IF_ERROR(RequestGpuInfo(&gpu_info));
    command_queue_ = NewCommandQueue(gpu_info);
    auto workgroups_calculator =
        BestEffortWorkgroupsCalculator(options_.metadata, gpu_info);
    std::unique_ptr<CompiledModel> compiled_model;
    RETURN_IF_ERROR(Compile(compile_options, graph, tflite_graph_io, *shaders,
                            *workgroups_calculator, &compiled_model));

    // Create inference context.
    const RuntimeOptions runtime_options;
    RETURN_IF_ERROR(compiled_model->NewRun(runtime_options, &phwc4_objects_,
                                           command_queue_.get(),
                                           &inference_context_));
    return absl::OkStatus();
  }

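  // Runs one inference pass: uploads (or converts) inputs, executes the
  // compiled shaders, then downloads (or converts) outputs. Must be called on
  // the same thread, with the same EGL context, as Prepare.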
  absl::Status Invoke(TfLiteContext* context) {
    const EGLContext egl_context_at_delegate_init = env_->context().context();
    const EGLContext egl_context_at_delegate_invoke = eglGetCurrentContext();
    if (egl_context_at_delegate_init != egl_context_at_delegate_invoke) {
      return absl::FailedPreconditionError(
          "Delegate should run on the same thread where it was initialized.");
    }

    // Push input data from a tensor to GPU.
    for (ValueId id : inputs_) {
      const ValueRef& ref = tensors_[id];
      auto external_object = bhwc_objects_.FindBuffer(ref.tensor_index);
      if (external_object) {
        // Use input from GPU.
        // Conversion is needed only when the external object is not PHWC4.
        if (!IsPHWC4(tensors_[id].shape)) {
          RETURN_IF_ERROR(bhwc_to_phwc4_.Convert(
              ref.shape, *external_object, command_queue_.get(),
              phwc4_objects_.FindBuffer(id)));
        }
      } else {
        // Copy from CPU to GPU.
        TfLiteTensor& tensor = context->tensors[ref.tensor_index];
        RETURN_IF_ERROR(CopyToBufferHandle(id, &tensor));
      }
    }

    // Run inference.
    RETURN_IF_ERROR(inference_context_->Reset());
    RETURN_IF_ERROR(inference_context_->Execute());

    // Push output data from GPU to a tensor.
    bool finished_gpu_processing = false;
    for (ValueId id : outputs_) {
      const ValueRef& ref = tensors_[id];
      auto external_object = bhwc_objects_.FindBuffer(ref.tensor_index);
      if (external_object) {
        // Convert data from PHWC4 to BHWC and leave it in the GPU object.
        // Conversion is needed only when the external object is not PHWC4.
        if (!IsPHWC4(tensors_[id].shape)) {
          RETURN_IF_ERROR(
              phwc4_to_bhwc_.Convert(ref.shape, *phwc4_objects_.FindBuffer(id),
                                     command_queue_.get(), external_object));
        }
      } else {
        // Wait until all GPU commands are completed. This call leads to lower
        // processing latency because the buffer reading below will not stall
        // if the data is not yet ready.
        if (!finished_gpu_processing) {
          RETURN_IF_ERROR(command_queue_->WaitForCompletion());
          finished_gpu_processing = true;
        }
        // Copy from GPU to CPU.
        TfLiteTensor& tensor = context->tensors[ref.tensor_index];
        RETURN_IF_ERROR(CopyFromBufferHandle(id, &tensor));
      }
    }
    return absl::OkStatus();
  }

  TfLiteDelegate* tflite_delegate() { return &delegate_; }

 private:
  absl::Status FindObject(ValueId id, ValueRef* ref) const {
    if (id >= tensors_.size()) {
      return absl::InvalidArgumentError("Invalid buffer id");
    }
    *ref = tensors_[id];
    return absl::OkStatus();
  }

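  // The C delegate struct handed to the TFLite runtime. data_ points back at
  // this Delegate instance so that the static callbacks below can recover it
  // via GetGpuDelegate().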
  TfLiteDelegate delegate_ = {
      reinterpret_cast<void*>(this),  // .data_
      DelegatePrepare,                // .Prepare
      DelegateCopyFromBufferHandle,   // .CopyFromBufferHandle
      DelegateCopyToBufferHandle,     // .CopyToBufferHandle
      nullptr,                        // .FreeBufferHandle
      kTfLiteDelegateFlagsNone,       // .flags
  };

  TfLiteGpuDelegateOptions options_;

  std::unique_ptr<EglEnvironment> env_;
  std::vector<ValueRef> tensors_;  // indexed by ValueId
  std::vector<ValueId> inputs_;
  std::vector<ValueId> outputs_;
  ObjectManager phwc4_objects_;
  ObjectManager bhwc_objects_;  // key is tensor_index
  ConverterPhwc4ToBhwc phwc4_to_bhwc_;
  ConverterBhwcToPhwc4 bhwc_to_phwc4_;
  std::unique_ptr<CommandQueue> command_queue_;
  std::unique_ptr<InferenceContext> inference_context_;
};


inline Delegate* GetGpuDelegate(TfLiteNode* node) {
  return reinterpret_cast<Delegate*>(node->user_data);
}

inline Delegate* GetGpuDelegate(TfLiteDelegate* delegate) {
  return reinterpret_cast<Delegate*>(delegate->data_);
}

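// Called by the interpreter when the delegate is applied: queries the set of
// supported ops and replaces them with a single delegate kernel whose
// lifecycle callbacks are defined inline below.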
TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
  const TfLiteRegistration kRegistration = {
      // .init
      [](TfLiteContext* context, const char* buffer, size_t) -> void* {
        const auto* params =
            reinterpret_cast<const TfLiteDelegateParams*>(buffer);
        auto* gpu_delegate = GetGpuDelegate(params->delegate);
        // Everything below should happen in the prepare function call, but
        // TFLite for whatever reason forbids that.
        const auto status = gpu_delegate->Prepare(context, params);
        if (status.ok()) return gpu_delegate;
        TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate Prepare: %s",
                           std::string(status.message()).c_str());
        return nullptr;
      },
      // .free
      [](TfLiteContext*, void* buffer) -> void {},
      // .prepare
      [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
        return node->user_data ? kTfLiteOk : kTfLiteError;
      },
      // .invoke
      [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
        const auto status = GetGpuDelegate(node)->Invoke(context);
        if (status.ok()) return kTfLiteOk;
        TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate Invoke: %s",
                           std::string(status.message()).c_str());
        return kTfLiteError;
      },
      nullptr,              // .profiling_string
      0,                    // .builtin_code
      "TfLiteGpuDelegate",  // .custom_name
      1,                    // .version
  };
  TfLiteIntArray* ops_to_replace = GetOpsToReplace(context);
  const auto status = context->ReplaceNodeSubsetsWithDelegateKernels(
      context, kRegistration, ops_to_replace, delegate);
  TfLiteIntArrayFree(ops_to_replace);
  return status;
}

TfLiteStatus DelegateCopyFromBufferHandle(TfLiteContext* context,
                                          TfLiteDelegate* delegate,
                                          TfLiteBufferHandle buffer_handle,
                                          TfLiteTensor* tensor) {
  auto* gpu_delegate = GetGpuDelegate(delegate);
  if (!gpu_delegate) return kTfLiteError;
  const auto status = gpu_delegate->CopyFromBufferHandle(buffer_handle, tensor);
  if (status.ok()) return kTfLiteOk;
  TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate CopyFromBufferHandle: %s",
                     std::string(status.message()).c_str());
  return kTfLiteError;
}

TfLiteStatus DelegateCopyToBufferHandle(TfLiteContext* context,
                                        TfLiteDelegate* delegate,
                                        TfLiteBufferHandle buffer_handle,
                                        TfLiteTensor* tensor) {
  auto* gpu_delegate = GetGpuDelegate(delegate);
  if (!gpu_delegate) return kTfLiteError;
  const auto status = gpu_delegate->CopyToBufferHandle(buffer_handle, tensor);
  if (status.ok()) return kTfLiteOk;
  TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate CopyToBufferHandle: %s",
                     std::string(status.message()).c_str());
  return kTfLiteError;
}

}  // namespace
}  // namespace gl
}  // namespace gpu
}  // namespace tflite

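// Typical usage (a sketch; `interpreter` stands in for the caller's own
// tflite::Interpreter instance):
//
//   TfLiteGpuDelegateOptions options = TfLiteGpuDelegateOptionsDefault();
//   TfLiteDelegate* delegate = TfLiteGpuDelegateCreate(&options);
//   if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) {
//     // Fall back to CPU execution.
//   }
//   // ... interpreter->Invoke() as usual ...
//   TfLiteGpuDelegateDelete(delegate);  // after the interpreter is destroyed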
TfLiteGlCompileOptions TfLiteGlCompileOptionsDefault() {
  TfLiteGlCompileOptions options;
  options.precision_loss_allowed = 0;
  options.preferred_gl_object_type = TFLITE_GL_OBJECT_TYPE_FASTEST;
  options.dynamic_batch_enabled = 0;
  options.inline_parameters = 0;
  return options;
}

TfLiteGpuDelegateOptions TfLiteGpuDelegateOptionsDefault() {
  TfLiteGpuDelegateOptions options;
  options.metadata = nullptr;
  options.compile_options = TfLiteGlCompileOptionsDefault();
  return options;
}

TfLiteDelegate* TfLiteGpuDelegateCreate(
    const TfLiteGpuDelegateOptions* options) {
  TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
                       "Created TensorFlow Lite delegate for GPU.");
  auto* gpu_delegate = new tflite::gpu::gl::Delegate(options);
  return gpu_delegate ? gpu_delegate->tflite_delegate() : nullptr;
}

void TfLiteGpuDelegateDelete(TfLiteDelegate* delegate) {
  delete tflite::gpu::gl::GetGpuDelegate(delegate);
}

TfLiteStatus TfLiteGpuDelegateBindBufferToTensor(TfLiteDelegate* delegate,
                                                 GLuint buffer,
                                                 int tensor_index) {
  auto* gpu_delegate = tflite::gpu::gl::GetGpuDelegate(delegate);
  return gpu_delegate &&
                 gpu_delegate->BindBufferToTensor(buffer, tensor_index).ok()
             ? kTfLiteOk
             : kTfLiteError;
}

#ifndef TFLITE_GPU_BINARY_RELEASE
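// Scans the serialized tflite::Model's metadata buffers for a FlowMetadata
// flatbuffer (the blob passed back in via TfLiteGpuDelegateOptions::metadata
// and consumed by BestEffortWorkgroupsCalculator) and returns a pointer to it
// if the identifier and verifier checks pass, or nullptr otherwise.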
const uint8_t* TfLiteGpuDelegateGetModelMetadata(const void* tflite_model) {
  const auto* model = reinterpret_cast<const tflite::Model*>(tflite_model);
  if (!model || !model->metadata_buffer() || !model->buffers()) return nullptr;
  for (int32_t buffer_index : *model->metadata_buffer()) {
    // Skip buffer indices that are out of range.
    if (buffer_index < 0 || buffer_index >= model->buffers()->size()) continue;
    const tflite::Buffer* buffer = model->buffers()->Get(buffer_index);
    if (!buffer || !buffer->data()) continue;
    const uint8_t* data = buffer->data()->data();
    if (!flatbuffers::BufferHasIdentifier(
            data, tflite::gpu::gl::data::FlowMetadataIdentifier())) {
      continue;
    }
    flatbuffers::Verifier verifier(data, buffer->data()->size());
    return tflite::gpu::gl::data::VerifyFlowMetadataBuffer(verifier) ? data
                                                                     : nullptr;
  }
  return nullptr;
}
#endif  // TFLITE_GPU_BINARY_RELEASE