/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/delegate.h"

#include <cstdint>
#include <memory>
#include <thread>  // NOLINT(build/c++11)
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/memory/memory.h"
#include "absl/types/span.h"
#include "tensorflow/lite/builtin_ops.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/delegates/gpu/api.h"
#include "tensorflow/lite/delegates/gpu/cl/api.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_builder.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/quantization_util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/minimal_logging.h"

#ifndef CL_DELEGATE_NO_GL
#include "tensorflow/lite/delegates/gpu/gl/api2.h"
#endif

namespace tflite {
namespace gpu {
namespace {

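// Maps the public TfLiteGpuDelegateOptionsV2 enum values onto the internal
// InferencePriority/InferenceUsage enums used by the GPU backends.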
InferencePriority ToPriority(int32_t priority) {
  switch (priority) {
    case TFLITE_GPU_INFERENCE_PRIORITY_AUTO:
      return InferencePriority::AUTO;
    case TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION:
      return InferencePriority::MAX_PRECISION;
    case TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY:
      return InferencePriority::MIN_LATENCY;
    case TFLITE_GPU_INFERENCE_PRIORITY_MIN_MEMORY_USAGE:
      return InferencePriority::MIN_MEMORY_USAGE;
  }
  return InferencePriority::UNKNOWN;
}

InferenceUsage ToUsage(int32_t usage) {
  switch (usage) {
    case TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER:
      return InferenceUsage::FAST_SINGLE_ANSWER;
    case TFLITE_GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED:
      return InferenceUsage::SUSTAINED_SPEED;
  }
  return InferenceUsage::UNKNOWN;
}

// Forward declarations.
TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate);

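// Holds the delegate options and the TfLiteDelegate struct that is handed back
// to TFLite. A single Delegate instance is shared by every DelegateKernel that
// DelegatePrepare() creates for the delegated partitions.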
class Delegate {
 public:
  explicit Delegate(const TfLiteGpuDelegateOptionsV2* options)
      : num_delegate_kernels_(0) {
    options_ = options ? *options : TfLiteGpuDelegateOptionsV2Default();
    if (options_.max_delegated_partitions <= 0) {
      options_.max_delegated_partitions = 1;
    }
  }

  TfLiteDelegate* tflite_delegate() { return &delegate_; }
  const TfLiteGpuDelegateOptionsV2& options() const { return options_; }

  bool IsQuantOpsAllowed() const {
    return options_.experimental_flags &
           TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;
  }
  int MaxDelegatedPartitions() const {
    return options_.max_delegated_partitions;
  }
  int num_delegate_kernels() const { return num_delegate_kernels_; }

 private:
  TfLiteDelegate delegate_ = {
      .data_ = reinterpret_cast<void*>(this),
      .Prepare = DelegatePrepare,
      .CopyFromBufferHandle = nullptr,
      .CopyToBufferHandle = nullptr,
      .FreeBufferHandle = nullptr,
      .flags = kTfLiteDelegateFlagsNone,
  };

  TfLiteGpuDelegateOptionsV2 options_;
  int num_delegate_kernels_ = 0;

  friend class DelegateKernel;
};

// Represents the execution of a subset of nodes on the GPU.
class DelegateKernel {
 public:
  explicit DelegateKernel(Delegate* delegate) : delegate_(delegate) {
    ++delegate_->num_delegate_kernels_;
  }
  ~DelegateKernel() { --delegate_->num_delegate_kernels_; }

  absl::Status Prepare(TfLiteContext* context,
                       const TfLiteDelegateParams* delegate_params) {
    thread_id_prepare_ = std::this_thread::get_id();

    // Extract the TFLite delegate execution plan from the context and convert
    // it into a GraphFloat32.
    GraphFloat32 graph;
    std::vector<uint32_t> input_refs;
    std::vector<uint32_t> output_refs;
    RETURN_IF_ERROR(InitializeGraph(context, delegate_params, &graph,
                                    &input_refs, &output_refs));

    std::unique_ptr<InferenceBuilder> builder;
    bool graph_is_destroyed;
    const int experimental_flags = delegate_->options().experimental_flags;
    if (experimental_flags & TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY) {
      RETURN_IF_ERROR(
          InitializeOpenClApi(&graph, &builder, &graph_is_destroyed));
    } else if (experimental_flags & TFLITE_GPU_EXPERIMENTAL_FLAGS_GL_ONLY) {
      RETURN_IF_ERROR(InitializeOpenGlApi(&graph, &builder));
    } else {
      // By default, try OpenCL first and fall back to OpenGL if that fails.
      absl::Status status =
          InitializeOpenClApi(&graph, &builder, &graph_is_destroyed);
      if (!status.ok()) {
        TF_LITE_KERNEL_LOG(context, std::string(status.message()).c_str());
        TF_LITE_KERNEL_LOG(context, "Falling back to OpenGL");

        // The graph needs to be re-created because it was moved above.
        GraphFloat32 graph2;
        if (graph_is_destroyed) {
          RETURN_IF_ERROR(InitializeGraph(context, delegate_params, &graph2,
                                          &input_refs, &output_refs));
        }
        RETURN_IF_ERROR(InitializeOpenGlApi(
            graph_is_destroyed ? &graph2 : &graph, &builder));
      }
    }

    // At this point TFLite hasn't allocated tensors yet; therefore, collect
    // the indices now and set all input and output tensors from TFLite later.
    input_indices_.reserve(input_refs.size());
    for (uint32_t tensor_index : input_refs) {
      const int64_t object_index = input_indices_.size();
      input_indices_.push_back(tensor_index);
      RETURN_IF_ERROR(
          builder->SetInputObjectDef(object_index, GetObjectDef(tensor_index)));
    }
    output_indices_.reserve(output_refs.size());
    for (uint32_t tensor_index : output_refs) {
      const int64_t object_index = output_indices_.size();
      output_indices_.push_back(tensor_index);
      RETURN_IF_ERROR(builder->SetOutputObjectDef(object_index,
                                                  GetObjectDef(tensor_index)));
    }

    return builder->Build(&runner_);
  }

  // This directs the runtime to allocate memory for input/output temporary
  // tensors that require dequantization/quantization.
  absl::Status GetRequiredTemporaries(TfLiteContext* context, TfLiteNode* node,
                                      TfLiteIntArray** temporaries_array_ptr) {
    if (quant_conversion_map_.empty()) return absl::OkStatus();

    std::vector<int> temporary_tensors;
    for (auto index : input_indices_) {
      if (quant_conversion_map_.find(index) != quant_conversion_map_.end()) {
        temporary_tensors.push_back(index);
      }
    }
    for (auto index : output_indices_) {
      if (quant_conversion_map_.find(index) != quant_conversion_map_.end()) {
        temporary_tensors.push_back(index);
      }
    }
    *temporaries_array_ptr = TfLiteIntArrayCreate(temporary_tensors.size());
    for (int i = 0; i < temporary_tensors.size(); ++i) {
      (*temporaries_array_ptr)->data[i] = temporary_tensors[i];
    }
    return absl::OkStatus();
  }

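  // Runs the delegated partition: dequantizes any quantized inputs into their
  // float shadow tensors, binds the CPU buffers, executes the GPU runner, and
  // re-quantizes the outputs.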
  absl::Status Invoke(TfLiteContext* context) {
    if (thread_id_prepare_ != std::this_thread::get_id()) {
      TFLITE_LOG(tflite::TFLITE_LOG_WARNING,
                 "GpuDelegate invoke thread != prepare thread");
      if (enforce_same_thread_) {
        return absl::FailedPreconditionError(
            "GpuDelegate must run on the same thread where it was "
            "initialized.");
      }
    }

    const bool is_dequant_required = !quant_conversion_map_.empty();
    if (is_dequant_required) {
      RETURN_IF_ERROR(
          DequantizeInputs(context, input_indices_, quant_conversion_map_));
    }
    RETURN_IF_ERROR(SetInputsAndOutputs(context));
    RETURN_IF_ERROR(runner_->Run());
    if (is_dequant_required) {
      RETURN_IF_ERROR(
          QuantizeOutputs(context, output_indices_, quant_conversion_map_));
    }
    return absl::OkStatus();
  }

 private:
  absl::Status SetInputsAndOutputs(TfLiteContext* context) {
    for (int i = 0; i < input_indices_.size(); ++i) {
      RETURN_IF_ERROR(runner_->SetInputObject(
          i, GetTensorObject(input_indices_[i], context)));
    }
    for (int i = 0; i < output_indices_.size(); ++i) {
      RETURN_IF_ERROR(runner_->SetOutputObject(
          i, GetTensorObject(output_indices_[i], context)));
    }
    return absl::OkStatus();
  }

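  // All inputs/outputs are exchanged with TFLite as user-provided CPU memory
  // in FLOAT32 / BHWC form; the actual buffers are bound on every invoke in
  // SetInputsAndOutputs().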
  ObjectDef GetObjectDef(int index) const {
    ObjectDef default_object_def;
    default_object_def.data_type = DataType::FLOAT32;
    default_object_def.data_layout = DataLayout::BHWC;
    default_object_def.object_type = ObjectType::CPU_MEMORY;
    default_object_def.user_provided = true;
    return default_object_def;
  }

  TensorObject GetTensorObject(int index, TfLiteContext* context) const {
    auto& tensor = context->tensors[index];
    return MakeCpuMemory(absl::MakeSpan(tensor.data.raw, tensor.bytes));
  }

 private:
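  // Converts the delegated partition described by `delegate_params` into a
  // GraphFloat32 and records the TFLite tensor indices of the graph's inputs
  // and outputs. When quantized inference is allowed, this also rebuilds
  // quant_conversion_map_.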
  absl::Status InitializeGraph(TfLiteContext* context,
                               const TfLiteDelegateParams* delegate_params,
                               GraphFloat32* graph,
                               std::vector<uint32_t>* input_refs,
                               std::vector<uint32_t>* output_refs) {
    quant_conversion_map_.clear();
    if (delegate_->IsQuantOpsAllowed()) {
      RETURN_IF_ERROR(BuildFinalModel(context, delegate_params, graph,
                                      &quant_conversion_map_));
    } else {
      RETURN_IF_ERROR(BuildFinalModel(context, delegate_params, graph));
    }

    input_refs->clear();
    output_refs->clear();
    const auto inputs = graph->inputs();
    input_refs->reserve(inputs.size());
    for (const auto& input : inputs) {
      input_refs->push_back(input->tensor.ref);
    }
    const auto outputs = graph->outputs();
    output_refs->reserve(outputs.size());
    for (const auto& output : outputs) {
      output_refs->push_back(output->tensor.ref);
    }

    return absl::OkStatus();
  }

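  // Creates an OpenCL inference environment and builds an InferenceBuilder
  // from `graph`. The graph is moved into the builder, so *graph_is_destroyed
  // tells the caller whether the original graph is still usable (e.g. for the
  // OpenGL fallback).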
  absl::Status InitializeOpenClApi(GraphFloat32* graph,
                                   std::unique_ptr<InferenceBuilder>* builder,
                                   bool* graph_is_destroyed) {
    *graph_is_destroyed = false;
    cl::InferenceEnvironmentOptions env_options;
    cl::InferenceEnvironmentProperties properties;
    RETURN_IF_ERROR(cl::NewInferenceEnvironment(env_options, &cl_environment_,
                                                &properties));
    auto delegate_options = delegate_->options();
    cl::InferenceOptions options;
    // If is_precision_loss_allowed == -1, it was not set by the user, so rely
    // on the inference priorities instead.
    if (delegate_options.is_precision_loss_allowed == -1) {
      options.priority1 = ToPriority(delegate_options.inference_priority1);
      options.priority2 = ToPriority(delegate_options.inference_priority2);
      options.priority3 = ToPriority(delegate_options.inference_priority3);
    } else {
      // The user set is_precision_loss_allowed explicitly, so honor it.
      if (delegate_options.is_precision_loss_allowed == 0) {
        options.priority1 = InferencePriority::MAX_PRECISION;
      } else {
        options.priority1 = InferencePriority::MIN_LATENCY;
      }
    }
    options.usage = ToUsage(delegate_options.inference_preference);
    *graph_is_destroyed = true;
    RETURN_IF_ERROR(cl_environment_->NewInferenceBuilder(
        options, std::move(*graph), builder));
    TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
                         "Initialized OpenCL-based API.");
    return absl::OkStatus();
  }

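  // Creates an OpenGL inference environment and builds an InferenceBuilder
  // from `graph`. Compiled out when CL_DELEGATE_NO_GL is defined. The OpenGL
  // path requires Invoke() to run on the thread that prepared the kernel,
  // hence enforce_same_thread_ is set.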
  absl::Status InitializeOpenGlApi(GraphFloat32* graph,
                                   std::unique_ptr<InferenceBuilder>* builder) {
#ifndef CL_DELEGATE_NO_GL
    gl::InferenceEnvironmentOptions env_options;
    gl::InferenceEnvironmentProperties properties;
    RETURN_IF_ERROR(
        NewInferenceEnvironment(env_options, &gl_environment_, &properties));
    auto delegate_options = delegate_->options();
    gl::InferenceOptions options;
    options.usage = ToUsage(delegate_options.inference_preference);
    options.priority1 = ToPriority(delegate_options.inference_priority1);
    options.priority2 = ToPriority(delegate_options.inference_priority2);
    options.priority3 = ToPriority(delegate_options.inference_priority3);
    RETURN_IF_ERROR(gl_environment_->NewInferenceBuilder(std::move(*graph),
                                                         options, builder));
    enforce_same_thread_ = true;
    TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
                         "Initialized OpenGL-based API.");
#endif
    return absl::OkStatus();
  }

  // The Delegate instance that's shared across all DelegateKernel instances.
  Delegate* const delegate_;  // doesn't own the memory.
  std::unique_ptr<cl::InferenceEnvironment> cl_environment_;
#ifndef CL_DELEGATE_NO_GL
  std::unique_ptr<gl::InferenceEnvironment> gl_environment_;
#endif
  std::unique_ptr<InferenceRunner> runner_;
  std::vector<int64_t> input_indices_;
  std::vector<int64_t> output_indices_;
  // Whenever quantized inference is enabled, this maps the tensor index of
  // each originally quantized (8-bit) tensor to its float version added in
  // model_builder - and vice versa.
  absl::flat_hash_map<int, int> quant_conversion_map_;
  std::thread::id thread_id_prepare_;  // thread id used for Prepare()
  bool enforce_same_thread_ = false;   // enforce Invoke() on the Prepare() thread
};

inline DelegateKernel* GetDelegateKernel(TfLiteNode* node) {
  return reinterpret_cast<DelegateKernel*>(node->user_data);
}

inline Delegate* GetDelegate(TfLiteDelegate* delegate) {
  return reinterpret_cast<Delegate*>(delegate->data_);
}

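// Builds a TfLiteRegistration whose init/free/prepare/invoke callbacks forward
// to DelegateKernel, then asks TFLite to replace every supported node subset
// with that kernel.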
TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
  const TfLiteRegistration kRegistration = {
      // .init
      [](TfLiteContext* context, const char* buffer, size_t) -> void* {
        const auto* params =
            reinterpret_cast<const TfLiteDelegateParams*>(buffer);
        auto* gpu_delegate = GetDelegate(params->delegate);
        // Everything below should happen in prepare function call, but TFLite
        // for whatever reason forbids that.
        auto gpu_delegate_kernel =
            absl::make_unique<DelegateKernel>(gpu_delegate);
        const auto status = gpu_delegate_kernel->Prepare(context, params);
        if (!status.ok()) {
          TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate Init: %s",
                             std::string(status.message()).c_str());
          return nullptr;
        }
        return gpu_delegate_kernel.release();
      },
      // .free
      [](TfLiteContext*, void* buffer) -> void {
        delete reinterpret_cast<DelegateKernel*>(buffer);
      },
      // .prepare
      [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
        if (!node->user_data) {
          TF_LITE_KERNEL_LOG(
              context,
              "TfLiteGpuDelegate Prepare: delegate is not initialized");
          return kTfLiteError;
        }
        auto* gpu_delegate_kernel = GetDelegateKernel(node);
        const auto status = gpu_delegate_kernel->GetRequiredTemporaries(
            context, node, &node->temporaries);
        if (!status.ok()) {
          TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate Prepare: %s",
                             std::string(status.message()).c_str());
          return kTfLiteError;
        }
        // TODO(akulik): tflite tensors are not allocated here either. It would
        // be good to set inputs and outputs only once here instead of setting
        // them every time in .invoke.
        return kTfLiteOk;
      },
      // .invoke
      [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
        const auto status = GetDelegateKernel(node)->Invoke(context);
        if (!status.ok()) {
          TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate Invoke: %s",
                             std::string(status.message()).c_str());
          return kTfLiteError;
        }
        return kTfLiteOk;
      },
      nullptr,                // .profiling_string
      0,                      // .builtin_code
      "TfLiteGpuDelegateV2",  // .custom_name
      1,                      // .version
  };

  auto* gpu_delegate = GetDelegate(delegate);
  TfLiteIntArray* ops_to_replace =
      GetOpsToReplace(context, gpu_delegate->IsQuantOpsAllowed(),
                      gpu_delegate->MaxDelegatedPartitions());
  const auto status = context->ReplaceNodeSubsetsWithDelegateKernels(
      context, kRegistration, ops_to_replace, delegate);
  TFLITE_LOG_PROD(TFLITE_LOG_INFO, "Created %d GPU delegate kernels.",
                  gpu_delegate->num_delegate_kernels());
  TfLiteIntArrayFree(ops_to_replace);
  return status;
}

}  // namespace
}  // namespace gpu
}  // namespace tflite

TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default() {
  TfLiteGpuDelegateOptionsV2 options = {
      // Set it to -1 to detect whether it was later adjusted.
      .is_precision_loss_allowed = -1,
      .inference_preference =
          TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER,
      .inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION,
      .inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
      .inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
      .experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT,
      .max_delegated_partitions = 1,
  };
  return options;
}

TfLiteDelegate* TfLiteGpuDelegateV2Create(
    const TfLiteGpuDelegateOptionsV2* options) {
  auto* gpu_delegate = new tflite::gpu::Delegate(options);
  TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
                       "Created TensorFlow Lite delegate for GPU.");
  return gpu_delegate ? gpu_delegate->tflite_delegate() : nullptr;
}

void TfLiteGpuDelegateV2Delete(TfLiteDelegate* delegate) {
  delete tflite::gpu::GetDelegate(delegate);
}
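
// Usage sketch (illustration only, not part of this translation unit): how a
// client might attach this delegate to an interpreter. `BuildInterpreter()` is
// a hypothetical helper standing in for the usual FlatBufferModel /
// InterpreterBuilder setup; error handling is elided.
//
//   TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
//   options.inference_preference =
//       TFLITE_GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED;
//   TfLiteDelegate* delegate = TfLiteGpuDelegateV2Create(&options);
//   std::unique_ptr<tflite::Interpreter> interpreter = BuildInterpreter();
//   interpreter->ModifyGraphWithDelegate(delegate);  // runs DelegatePrepare().
//   interpreter->AllocateTensors();
//   // ... fill input tensors, then:
//   interpreter->Invoke();  // dispatches to DelegateKernel::Invoke().
//   interpreter.reset();    // destroy the interpreter before the delegate.
//   TfLiteGpuDelegateV2Delete(delegate);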