1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_API_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_API_H_
18
19 // Usage example:
20 //
21 // // Builder is created from a model using GPU-specific parameters.
22 // std::unique_ptr<InferenceBuilder> builder = ...;
23 //
24 // // input data is coming from a texture
25 // // output data goes to CPU
26 // builder->SetInputObjectDef(0, {DataType::FLOAT16, DataLayout::PHWC4,
27 // ObjectType::OPENGL_TEXTURE, true});
28 // builder->SetOutputObjectDef(0, {DataType::FLOAT32, DataLayout::BHWC,
29 // ObjectType::CPU_MEMORY, false});
30 // std::unique_ptr<InferenceRunner> runner;
31 // RETURN_IF_ERROR(builder->Build(&runner)); // may take significant time.
32 // RETURN_IF_ERROR(
33 // runner->SetInputObject(0, OpenGlTexture{texture_ud, texture_format}));
34 // RETURN_IF_ERROR(runner->Run());
35
36 #include <cstdint>
37 #include <memory>
38 #include <vector>
39
40 #include "absl/types/span.h"
41 #include "absl/types/variant.h"
42 #include <CL/cl.h>
43 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
44 #include "tensorflow/lite/delegates/gpu/common/status.h"
45 #include "tensorflow/lite/delegates/gpu/common/util.h"
46 #include <vulkan/vulkan.h>
47
48 #define GL_NO_PROTOTYPES
49 #define EGL_NO_PROTOTYPES
50 #include "tensorflow/lite/delegates/gpu/gl/portable_gl31.h"
51 #undef GL_NO_PROTOTYPES
52 #undef EGL_NO_PROTOTYPES
53
54 namespace tflite {
55 namespace gpu {
56
// Tensor memory layouts.
//
// Dimension abbreviations used throughout this file:
//   B  - batch
//   H  - height
//   W  - width
//   C  - channels
//   D  - depth := DivideRoundUp(C, 4)
//   C4 - the constant 4 (channels are grouped in slices of four).
enum class DataLayout {
  UNKNOWN,
  BHWC,
  DHWC4,
  HWDC4,
  HDWC4,
};
71
// Kinds of memory objects a tensor can live in (GPU API objects or plain
// host memory).
enum class ObjectType {
  UNKNOWN,
  OPENGL_SSBO,
  OPENGL_TEXTURE,
  CPU_MEMORY,
  OPENCL_TEXTURE,
  OPENCL_BUFFER,
  VULKAN_BUFFER,
  VULKAN_TEXTURE,
};
82
83 struct OpenGlBuffer {
84 OpenGlBuffer() = default;
OpenGlBufferOpenGlBuffer85 explicit OpenGlBuffer(GLuint new_id) : id(new_id) {}
86
87 GLuint id = GL_INVALID_INDEX;
88 };
89
90 struct OpenGlTexture {
91 OpenGlTexture() = default;
OpenGlTextureOpenGlTexture92 OpenGlTexture(GLuint new_id, GLenum new_format)
93 : id(new_id), format(new_format) {}
94
95 GLuint id = GL_INVALID_INDEX;
96 GLenum format = GL_INVALID_ENUM;
97 };
98
99 struct OpenClBuffer {
100 OpenClBuffer() = default;
OpenClBufferOpenClBuffer101 explicit OpenClBuffer(cl_mem new_memobj) : memobj(new_memobj) {}
102
103 cl_mem memobj = nullptr;
104 };
105
106 struct OpenClTexture {
107 OpenClTexture() = default;
OpenClTextureOpenClTexture108 explicit OpenClTexture(cl_mem new_memobj) : memobj(new_memobj) {}
109
110 cl_mem memobj = nullptr;
111 // TODO(akulik): should it specify texture format?
112 };
113
114 struct VulkanBuffer {
115 VulkanBuffer() = default;
VulkanBufferVulkanBuffer116 explicit VulkanBuffer(VkBuffer buffer_, VkDeviceSize size_,
117 VkDeviceMemory memory_, VkDeviceSize offset_)
118 : buffer(buffer_), size(size_), memory(memory_), offset(offset_) {}
119
120 VkBuffer buffer;
121 VkDeviceSize size;
122 VkDeviceMemory memory;
123 VkDeviceSize offset;
124 };
125
126 struct VulkanTexture {
127 VulkanTexture() = default;
VulkanTextureVulkanTexture128 explicit VulkanTexture(VkDeviceMemory new_memory) : memory(new_memory) {}
129
130 VkImage image;
131 VkImageView image_view;
132 VkFormat format;
133 VkExtent3D extent;
134 VkDeviceMemory memory;
135 VkDeviceSize offset;
136 };
137
138 struct VulkanMemory {
139 VulkanMemory() = default;
VulkanMemoryVulkanMemory140 explicit VulkanMemory(VkDeviceMemory new_memory) : memory(new_memory) {}
141
142 VkDeviceMemory memory;
143 VkDeviceSize size;
144 VkDeviceSize offset;
145 };
146
// A non-owning view of host memory: raw pointer plus byte size.
struct CpuMemory {
  CpuMemory() = default;
  CpuMemory(void* memory, size_t memory_size_bytes)
      : data(memory), size_bytes(memory_size_bytes) {}

  void* data = nullptr;
  size_t size_bytes = 0;
};
155
156 template <typename T>
MakeCpuMemory(absl::Span<T> t)157 inline CpuMemory MakeCpuMemory(absl::Span<T> t) {
158 CpuMemory m;
159 m.data = t.data();
160 m.size_bytes = t.size() * sizeof(T);
161 return m;
162 }
163
164 template <typename T>
MakeReadableCpuMemory(absl::Span<const T> t)165 inline CpuMemory MakeReadableCpuMemory(absl::Span<const T> t) {
166 CpuMemory m;
167 m.data = const_cast<T*>(t.data());
168 m.size_bytes = t.size() * sizeof(T);
169 return m;
170 }
171
172 // Defines object representation.
173 struct ObjectDef {
174 DataType data_type = DataType::UNKNOWN;
175 DataLayout data_layout = DataLayout::UNKNOWN;
176 ObjectType object_type = ObjectType::UNKNOWN;
177
178 // If true, then object is managed externally and needs to be provided to
179 // InferenceRunner by a user before running inference.
180 //
181 // User-provided objects will not be re-used internally for any purpose to
182 // lower overall memory usage.
183 bool user_provided = false;
184
185 bool operator==(const ObjectDef& other) const {
186 return data_type == other.data_type && data_layout == other.data_layout &&
187 object_type == other.object_type &&
188 user_provided == other.user_provided;
189 }
190 };
191
192 bool IsValid(const ObjectDef& def);
193
194 struct Dimensions {
DimensionsDimensions195 Dimensions() : b(1), h(1), w(1), c(1) {}
196
DimensionsDimensions197 Dimensions(int32_t batch, int32_t height, int32_t width, int32_t channels)
198 : b(batch), h(height), w(width), c(channels) {}
199
dDimensions200 int32_t d() const { return DivideRoundUp(c, 4); }
201
productDimensions202 int32_t product() const { return b * h * w * c; }
203
204 bool operator==(const Dimensions& other) const {
205 return b == other.b && h == other.h && w == other.w && c == other.c;
206 }
207
208 int32_t b;
209 int32_t h;
210 int32_t w;
211 int32_t c;
212 };
213
214 // Connects tensor shape with corresponding object definition.
215 struct TensorObjectDef {
216 // Dimensions semantic is defined by corresponding DataLayout.
217 Dimensions dimensions;
218 ObjectDef object_def;
219
220 bool operator==(const TensorObjectDef& other) const {
221 return dimensions == other.dimensions && object_def == other.object_def;
222 }
223 };
224
225 // @return true if tensor object def is defined.
226 bool IsValid(const TensorObjectDef& def);
227
228 // @return the number of elements in a tensor object.
229 uint32_t NumElements(const TensorObjectDef& def);
230
231 using TensorObject =
232 absl::variant<absl::monostate, OpenGlBuffer, OpenGlTexture, CpuMemory,
233 OpenClBuffer, OpenClTexture, VulkanBuffer, VulkanTexture>;
234
235 // @return true if object is set and corresponding values are defined.
236 bool IsValid(const TensorObjectDef& def, const TensorObject& object);
237
238 ObjectType GetType(const TensorObject& object);
239
240 // @return true if corresponding object is set for the given type
241 bool IsObjectPresent(ObjectType type, const TensorObject& obj);
242
243 // @return true if corresponding object has already been initialized and
244 // assigned with a specific ObjectType.
245 bool IsObjectInitialized(const TensorObject& obj);
246
247 class InferenceRunner;
248
249 // Allows to inspect and change input and output definitions before a graph is
250 // prepared for the inference.
251 class InferenceBuilder {
252 public:
~InferenceBuilder()253 virtual ~InferenceBuilder() {}
254
255 // Returns inference graph inputs and outputs definitions.
256 virtual std::vector<TensorObjectDef> inputs() const = 0;
257 virtual std::vector<TensorObjectDef> outputs() const = 0;
258
259 // Sets new shape for the input if underlying implementation and graph
260 // structure allows dynamic tensors.
261 virtual absl::Status SetInputShape(int index,
262 const Dimensions& dimensions) = 0;
263
264 // Updates object definitions for the given index. Implementation may allow
265 // to use different layouts and/or data type conversions between objects
266 // defined in a graph and given objects, for example:
267 // input '0' is DataType::FLOAT32, DataLayout::BHWC.
268 // A user, however, has an input in DataType::FLOAT16, DataLayout::PHWC4.
269 // An implementation may allow this transformation to happen automatically
270 // under the hood.
271 virtual absl::Status SetInputObjectDef(int index, ObjectDef def) = 0;
272 virtual absl::Status SetOutputObjectDef(int index, ObjectDef def) = 0;
SetAllInputObjectDefsTo(ObjectDef def)273 virtual absl::Status SetAllInputObjectDefsTo(ObjectDef def) {
274 auto input_defs = inputs();
275 for (int i = 0; i < input_defs.size(); ++i) {
276 RETURN_IF_ERROR(SetInputObjectDef(i, def));
277 }
278 return absl::OkStatus();
279 }
SetAllOutputObjectDefsTo(ObjectDef def)280 virtual absl::Status SetAllOutputObjectDefsTo(ObjectDef def) {
281 auto output_defs = outputs();
282 for (int i = 0; i < output_defs.size(); ++i) {
283 RETURN_IF_ERROR(SetOutputObjectDef(i, def));
284 }
285 return absl::OkStatus();
286 }
287
288 // Creates new instance of the inference runner. InferenceBuilder stays valid
289 // and could be used to create another inference runner if needed.
290 //
291 // This method may take significant time to prepare new inference runner. For
292 // example, it may require to compile OpenGL shaders.
293 virtual absl::Status Build(std::unique_ptr<InferenceRunner>* runner) = 0;
294 };
295
296 // Runs prepared inference. Every object marked as external needs to be set
297 // prior calling Run method.
298 class InferenceRunner {
299 public:
~InferenceRunner()300 virtual ~InferenceRunner() {}
301
302 // Returns inference graph inputs and outputs definitions.
303 virtual std::vector<TensorObjectDef> inputs() const = 0;
304 virtual std::vector<TensorObjectDef> outputs() const = 0;
305
306 // Getters provide access to underlying objects for the given index.
307 // Setters allow to set or change external object for the given index. Note,
308 // object need to match object definition set before in InferenceBuilder.
309
310 virtual absl::Status GetInputObject(int index, TensorObject* object) = 0;
311 virtual absl::Status GetOutputObject(int index, TensorObject* object) = 0;
312 virtual absl::Status SetInputObject(int index, TensorObject object) = 0;
313 virtual absl::Status SetOutputObject(int index, TensorObject object) = 0;
314
315 virtual absl::Status Run() = 0;
316 };
317
// Encapsulated compilation/runtime tradeoffs.
enum class InferenceUsage {
  UNKNOWN,

  // InferenceRunner will be used only once, so minimizing bootstrap time
  // matters as much as the run itself.
  FAST_SINGLE_ANSWER,

  // Prefer maximizing the throughput: the same inference runner will be used
  // repeatedly on different inputs.
  SUSTAINED_SPEED,
};
330
// Defines aspects to control while instantiating a runner.
enum class InferencePriority {
  UNKNOWN,
  // Placeholder: let the implementation pick, subject to the higher-ranked
  // explicit priorities (see InferenceOptions).
  AUTO,
  MIN_LATENCY,
  MAX_PRECISION,
  MIN_MEMORY_USAGE,
};
343
344 struct InferenceOptions {
345 InferenceUsage usage = InferenceUsage::SUSTAINED_SPEED;
346
347 // Ordered priorities provide better understanding of desired semantics,
348 // where priority(n) is more important than priority(n+1).
349 // AUTO priority is needed when a single priority is the most important
350 // factor. For example, priority1 = InferencePriority::MIN_LATENCY and leaving
351 // everything else to AUTO would result in configuration that achieves maximum
352 // performance.
353 //
354 // AUTO priority can only be used when higher priorities are fully specified.
355 // For example:
356 // VALID: priority1 = MIN_LATENCY, priority2 = AUTO, priority3 = AUTO
357 // VALID: priority1 = MIN_LATENCY, priority2 = MAX_PRECISION,
358 // priority3 = AUTO
359 // INVALID: priority1 = AUTO, priority2 = MIN_LATENCY, priority3 = AUTO
360 // INVALID: priority1 = MIN_LATENCY, priority2 = AUTO,
361 // priority3 = MAX_PRECISION
362 // Invalid priorities will result in error.
363 InferencePriority priority1 = InferencePriority::MAX_PRECISION;
364
365 InferencePriority priority2 = InferencePriority::AUTO;
366
367 InferencePriority priority3 = InferencePriority::AUTO;
368 };
369
370 // Returns a position number for the priority. If priority is missing,
371 // then it would return 'max num priorities + 1'.
372 int GetPosition(const InferenceOptions& options, InferencePriority p);
373
374 // Return true if options are valid.
375 bool IsValid(const InferenceOptions& options);
376
377 // Resolves AUTO priorities and specifies them explicitly.
378 // Note, no-one should assume that these mappings will not change.
379 // Technically this function is declared here for code re-use purposes and
380 // by no means it should be treated as canonical way to resolve AUTO.
381 void ResolveAutoPriority(InferenceOptions* options);
382
// Relative rank of one priority against another (see GetRelativeImportance).
enum class PriorityImportance {
  UNKNOWN,
  HIGHER,
  LOWER,
};
388
389 // If both p1 and p2 are not present in options, return UNKNOWN
390 // If p1 is present, but p2 is not, return HIGHER
391 // If p2 is present, but p1 is not, return LOWER
392 // If both are present, and p1 is more important, return HIGHER, otherwise,
393 // LOWER.
394 PriorityImportance GetRelativeImportance(const InferenceOptions& options,
395 InferencePriority p1,
396 InferencePriority p2);
397
398 } // namespace gpu
399 } // namespace tflite
400
401 #endif // TENSORFLOW_LITE_DELEGATES_GPU_API_H_
402