/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_

#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include "tensorflow/lite/delegates/gpu/cl/gpu_object.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/recordable_queue_builder.h"
#include "tensorflow/lite/delegates/gpu/cl/serialization_generated.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_hints.h"
#include "tensorflow/lite/delegates/gpu/common/precision.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"

namespace tflite {
namespace gpu {
namespace cl {

47 struct CLNode {
48   ClOperation cl_operation;
49   std::vector<ValueId> inputs;
50   std::vector<ValueId> outputs;
51 
52   // Mostly for debug purposes.
53   std::string name;
54 
55   CLNode() = default;
56 
57   CLNode(CLNode&& node) = default;
58   CLNode& operator=(CLNode&& node) = default;
59   CLNode(const CLNode&) = delete;
60   CLNode& operator=(const CLNode&) = delete;
61 };
62 
63 class InferenceContext {
64  public:
65   struct CreateInferenceInfo {
66     CalculationsPrecision precision;
67     TensorStorageType storage_type;
68     ModelHints hints;
69   };
70 
71   absl::Status InitFromGraph(const CreateInferenceInfo& create_info,
72                              const GraphFloat32& graph, Environment* env,
73                              std::vector<uint8_t>* serialized_model = nullptr);
74 
75   // Applies OpenCL-specific transformations to the graph before the
76   // initialization. These transformations are either impossible or useless in
77   // other backends.
78   absl::Status InitFromGraphWithTransforms(
79       const CreateInferenceInfo& create_info, GraphFloat32* graph,
80       Environment* env, std::vector<uint8_t>* serialized_model = nullptr);
81 
82   absl::Status AddToQueue(CLCommandQueue* queue);
83   absl::Status Profile(ProfilingCommandQueue* queue, ProfilingInfo* result);
84   // for profiling and memory statistics
85   uint64_t GetSizeOfMemoryAllocatedForIntermediateTensors() const;
86 
87   absl::Status SetInputTensor(ValueId id, const TensorFloat32& tensor,
88                               CLCommandQueue* queue);
89 
90   // It will work only with input/output tensor ids. For all other ids we don't
91   // have any guarantees.
92   Tensor* GetTensor(ValueId id);
93 
94   absl::Status GetOutputTensor(ValueId id, CLCommandQueue* queue,
95                                TensorFloat32* result);
96 
GetInputIds()97   const std::vector<ValueId>& GetInputIds() const { return input_ids_; }
GetOutputIds()98   const std::vector<ValueId>& GetOutputIds() const { return output_ids_; }
99 
100   absl::Status RestoreDeserialized(
101       const absl::Span<const uint8_t> serialized_model, Environment* env);
102 
103  private:
104   enum class TensorMemoryType { kStrongShape, kBuffer, kVariable, kConst };
105 
106   friend flatbuffers::Offset<data::InferenceContext> Encode(
107       const CLDevice& device, const InferenceContext& inference,
108       const ProgramCache& program_cache, const std::vector<int64_t>& in_refs,
109       std::vector<int64_t>& out_refs, flatbuffers::FlatBufferBuilder* builder);
110   friend absl::Status Decode(const CLContext& context, const CLDevice& device,
111                              ProgramCache* program_cache,
112                              const data::InferenceContext* fb_inference,
113                              InferenceContext* inference);
114 
115   void CopyInAndOutIds(const GraphFloat32& graph);
116   absl::Status ConvertOperations(const GpuInfo& gpu_info,
117                                  const GraphFloat32& graph, ModelHints hints);
118   void CreateLinks();
119   absl::Status ReserveGraphTensors(const CreateInferenceInfo& create_info,
120                                    const GpuInfo& gpu_info,
121                                    const GraphFloat32& graph);
122   absl::Status Merge();
123   absl::Status AllocateMemory(const GpuInfo& gpu_info, CLContext* context);
124 
125   absl::Status AllocateMemoryForConstTensors(CLContext* context);
126 
127   absl::Status AllocateMemoryForVariableTensors(CLContext* context);
128 
129   absl::Status AllocateMemoryForBuffers(const GpuInfo& gpu_info,
130                                         CLContext* context);
131 
132   absl::Status AllocateMemoryForStrongShapes(const GpuInfo& gpu_info,
133                                              CLContext* context);
134 
135   // utility function
136   void GetUsages(const std::function<bool(ValueId)>& functor,
137                  std::map<ValueId, int2>* usages);
138 
139   TensorMemoryType GetTensorMemoryType(const GpuInfo& gpu_info, ValueId id);
140 
141   void BindMemoryToOperations();
142   absl::Status Compile(const CreationContext& creation_context);
143   absl::Status Tune(TuningType tuning_type, const GpuInfo& gpu_info,
144                     ProfilingCommandQueue* profiling_queue);
145   absl::Status UpdateParams();
146 
147   void InitRecordableQueue(Environment* env);
148 
149   void ReleaseCPURepresentation();
150 
151   // performance hacks
152   bool need_flush_ = false;
153 
154   bool flush_periodically_ = false;
155   int flush_period_ = 1;
156 
157   // In order to reduce memory leak on Mali a pipeline needs to be synchronized
158   // with CPU to prevent growing internal global OpenCL kernel pool. One trick
159   // is to enqueue an event from a previous run. Most of the time is should
160   // already be executed on GPU and should not stall the pipeline.
161   bool need_manual_release_ = false;
162   CLEvent prev_enqueue_start_point_;
163 
164   CalculationsPrecision precision_;
165   TensorStorageType storage_type_;
166 
167   // Directly mapped nodes from graph, but some of them "inactive" due
168   //  to fusion (inactive = fused).
169   // Memory is allocated only once, in ConvertOperations, and is not modified
170   //  anywhere.
171   std::vector<CLNode> nodes_;
172 
173   struct DummyTensor {
174     BHWC shape;
175     TensorDescriptor descriptor;
176 
177     bool operator==(const DummyTensor& b) const {
178       return shape == b.shape && descriptor == b.descriptor;
179     }
180   };
181 
182   class TensorReserver {
183    public:
TensorReserver()184     TensorReserver() : next_(0) {}
Add(const DummyTensor & dummy)185     ValueId Add(const DummyTensor& dummy) {
186       reservations_[next_] = dummy;
187       return next_++;
188     }
Add(ValueId id,const DummyTensor & dummy)189     void Add(ValueId id, const DummyTensor& dummy) {
190       reservations_[id] = dummy;
191     }
SetNext(ValueId id)192     void SetNext(ValueId id) { next_ = id; }
Get(ValueId id)193     DummyTensor Get(ValueId id) { return reservations_[id]; }
194 
GetTensorDescs()195     std::vector<std::pair<ValueId, TensorDescriptor>> GetTensorDescs() const {
196       std::vector<std::pair<ValueId, TensorDescriptor>> result;
197       for (auto& v : reservations_) {
198         TensorDescriptor desc = v.second.descriptor;
199         desc.shape.b = v.second.shape.b;
200         desc.shape.h = v.second.shape.h;
201         desc.shape.w = v.second.shape.w;
202         desc.shape.d = 1;
203         desc.shape.c = v.second.shape.c;
204         result.push_back({v.first, desc});
205       }
206       return result;
207     }
208 
Add(const std::vector<std::pair<ValueId,TensorDescriptor>> & tensors)209     void Add(const std::vector<std::pair<ValueId, TensorDescriptor>>& tensors) {
210       for (auto& v : tensors) {
211         DummyTensor dummy;
212         dummy.descriptor = v.second;
213         dummy.shape.b = v.second.shape.b;
214         dummy.shape.h = v.second.shape.h;
215         dummy.shape.w = v.second.shape.w;
216         dummy.shape.c = v.second.shape.c;
217         Add(v.first, dummy);
218       }
219     }
220 
221    private:
222     absl::flat_hash_map<ValueId, DummyTensor> reservations_;
223     ValueId next_;
224   };
225   TensorReserver tensor_reserver_;
226 
227   absl::flat_hash_map<ValueId, TensorDescriptor> const_tensors_descs_;
228   std::map<ValueId, Tensor> const_tensors_;
229 
230   std::map<ValueId, Tensor> variable_tensors_;
231   Buffer shared_buffers_parent_;
232   std::vector<Buffer> shared_buffers_;
233   std::vector<Tensor>
234       shared_buffer_tensors_;  // use references to memory from shared_buffers_
235   std::map<ValueId, int> graph_ids_to_shared_buffer_tensors_;
236 
237   std::map<ValueId, Tensor> strong_shape_tensors_;
238   std::map<ValueId, ValueId> graph_ids_to_strong_shape_tensors_;
239 
240   std::vector<ValueId> input_ids_;
241   std::map<ValueId, ValueId> variable_ids_and_refs_;
242   std::vector<ValueId> output_ids_;
243 
244   std::unique_ptr<RecordableQueue> recordable_queue_;
245 };
246 
247 // Runs OpenCL specific transforms for the graph.
248 absl::Status RunGraphTransforms(GraphFloat32* graph);
249 
}  // namespace cl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_