/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_

#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include "tensorflow/lite/delegates/gpu/cl/gpu_object.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/recordable_queue_builder.h"
#include "tensorflow/lite/delegates/gpu/cl/serialization_generated.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_hints.h"
#include "tensorflow/lite/delegates/gpu/common/precision.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"

namespace tflite {
namespace gpu {
namespace cl {

struct CLNode {
  ClOperation cl_operation;
  std::vector<ValueId> inputs;
  std::vector<ValueId> outputs;

  // Mostly for debug purposes.
  std::string name;

  CLNode() = default;

  CLNode(CLNode&& node) = default;
  CLNode& operator=(CLNode&& node) = default;
  CLNode(const CLNode&) = delete;
  CLNode& operator=(const CLNode&) = delete;
};
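
// A rough end-to-end usage sketch (not part of the original header). The
// graph, environment, and CPU-side tensors (`graph`, `env`, `input_cpu`,
// `output_cpu`) are assumed to be set up elsewhere, and the precision/storage
// values below are only examples:
//
//   InferenceContext::CreateInferenceInfo create_info;
//   create_info.precision = CalculationsPrecision::F16;
//   create_info.storage_type = TensorStorageType::TEXTURE_2D;
//
//   InferenceContext context;
//   RETURN_IF_ERROR(
//       context.InitFromGraphWithTransforms(create_info, &graph, &env));
//   RETURN_IF_ERROR(context.SetInputTensor(context.GetInputIds()[0],
//                                          input_cpu, env.queue()));
//   RETURN_IF_ERROR(context.AddToQueue(env.queue()));
//   RETURN_IF_ERROR(context.GetOutputTensor(context.GetOutputIds()[0],
//                                           env.queue(), &output_cpu));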
class InferenceContext {
 public:
  struct CreateInferenceInfo {
    CalculationsPrecision precision;
    TensorStorageType storage_type;
    ModelHints hints;
  };

  absl::Status InitFromGraph(const CreateInferenceInfo& create_info,
                             const GraphFloat32& graph, Environment* env,
                             std::vector<uint8_t>* serialized_model = nullptr);

  // Applies OpenCL-specific transformations to the graph before the
  // initialization. These transformations are either impossible or useless in
  // other backends.
  absl::Status InitFromGraphWithTransforms(
      const CreateInferenceInfo& create_info, GraphFloat32* graph,
      Environment* env, std::vector<uint8_t>* serialized_model = nullptr);

  absl::Status AddToQueue(CLCommandQueue* queue);
  absl::Status Profile(ProfilingCommandQueue* queue, ProfilingInfo* result);
  // For profiling and memory statistics.
  uint64_t GetSizeOfMemoryAllocatedForIntermediateTensors() const;

  absl::Status SetInputTensor(ValueId id, const TensorFloat32& tensor,
                              CLCommandQueue* queue);

  // Works only with input/output tensor ids; for all other ids there are no
  // guarantees.
  Tensor* GetTensor(ValueId id);

  absl::Status GetOutputTensor(ValueId id, CLCommandQueue* queue,
                               TensorFloat32* result);

  const std::vector<ValueId>& GetInputIds() const { return input_ids_; }
  const std::vector<ValueId>& GetOutputIds() const { return output_ids_; }

  absl::Status RestoreDeserialized(
      const absl::Span<const uint8_t> serialized_model, Environment* env);

 private:
  enum class TensorMemoryType { kStrongShape, kBuffer, kVariable, kConst };

  friend flatbuffers::Offset<data::InferenceContext> Encode(
      const CLDevice& device, const InferenceContext& inference,
      const ProgramCache& program_cache, const std::vector<int64_t>& in_refs,
      std::vector<int64_t>& out_refs, flatbuffers::FlatBufferBuilder* builder);
  friend absl::Status Decode(const CLContext& context, const CLDevice& device,
                             ProgramCache* program_cache,
                             const data::InferenceContext* fb_inference,
                             InferenceContext* inference);

  void CopyInAndOutIds(const GraphFloat32& graph);
  absl::Status ConvertOperations(const GpuInfo& gpu_info,
                                 const GraphFloat32& graph, ModelHints hints);
  void CreateLinks();
  absl::Status ReserveGraphTensors(const CreateInferenceInfo& create_info,
                                   const GpuInfo& gpu_info,
                                   const GraphFloat32& graph);
  absl::Status Merge();
  absl::Status AllocateMemory(const GpuInfo& gpu_info, CLContext* context);

  absl::Status AllocateMemoryForConstTensors(CLContext* context);

  absl::Status AllocateMemoryForVariableTensors(CLContext* context);

  absl::Status AllocateMemoryForBuffers(const GpuInfo& gpu_info,
                                        CLContext* context);

  absl::Status AllocateMemoryForStrongShapes(const GpuInfo& gpu_info,
                                             CLContext* context);

  // Utility function.
  void GetUsages(const std::function<bool(ValueId)>& functor,
                 std::map<ValueId, int2>* usages);

  TensorMemoryType GetTensorMemoryType(const GpuInfo& gpu_info, ValueId id);

  void BindMemoryToOperations();
  absl::Status Compile(const CreationContext& creation_context);
  absl::Status Tune(TuningType tuning_type, const GpuInfo& gpu_info,
                    ProfilingCommandQueue* profiling_queue);
  absl::Status UpdateParams();

  void InitRecordableQueue(Environment* env);

  void ReleaseCPURepresentation();

  // Performance hacks.
  bool need_flush_ = false;

  bool flush_periodically_ = false;
  int flush_period_ = 1;

  // To reduce memory leaks on Mali, the pipeline needs to be synchronized with
  // the CPU to prevent the internal global OpenCL kernel pool from growing.
  // One trick is to enqueue an event from a previous run. Most of the time it
  // should already be executed on the GPU and should not stall the pipeline.
  bool need_manual_release_ = false;
  CLEvent prev_enqueue_start_point_;

  CalculationsPrecision precision_;
  TensorStorageType storage_type_;

  // Nodes are mapped directly from the graph, but some of them are "inactive"
  // due to fusion (inactive = fused).
  // Memory is allocated only once, in ConvertOperations, and is not modified
  // anywhere.
  std::vector<CLNode> nodes_;

  struct DummyTensor {
    BHWC shape;
    TensorDescriptor descriptor;

    bool operator==(const DummyTensor& b) const {
      return shape == b.shape && descriptor == b.descriptor;
    }
  };

  class TensorReserver {
   public:
    TensorReserver() : next_(0) {}
    ValueId Add(const DummyTensor& dummy) {
      reservations_[next_] = dummy;
      return next_++;
    }
    void Add(ValueId id, const DummyTensor& dummy) {
      reservations_[id] = dummy;
    }
    void SetNext(ValueId id) { next_ = id; }
    DummyTensor Get(ValueId id) { return reservations_[id]; }

    std::vector<std::pair<ValueId, TensorDescriptor>> GetTensorDescs() const {
      std::vector<std::pair<ValueId, TensorDescriptor>> result;
      for (auto& v : reservations_) {
        TensorDescriptor desc = v.second.descriptor;
        desc.shape.b = v.second.shape.b;
        desc.shape.h = v.second.shape.h;
        desc.shape.w = v.second.shape.w;
        desc.shape.d = 1;
        desc.shape.c = v.second.shape.c;
        result.push_back({v.first, desc});
      }
      return result;
    }

    void Add(
        const std::vector<std::pair<ValueId, TensorDescriptor>>& tensors) {
      for (auto& v : tensors) {
        DummyTensor dummy;
        dummy.descriptor = v.second;
        dummy.shape.b = v.second.shape.b;
        dummy.shape.h = v.second.shape.h;
        dummy.shape.w = v.second.shape.w;
        dummy.shape.c = v.second.shape.c;
        Add(v.first, dummy);
      }
    }

   private:
    absl::flat_hash_map<ValueId, DummyTensor> reservations_;
    ValueId next_;
  };
  TensorReserver tensor_reserver_;

  absl::flat_hash_map<ValueId, TensorDescriptor> const_tensors_descs_;
  std::map<ValueId, Tensor> const_tensors_;

  std::map<ValueId, Tensor> variable_tensors_;
  Buffer shared_buffers_parent_;
  std::vector<Buffer> shared_buffers_;
  std::vector<Tensor>
      shared_buffer_tensors_;  // use references to memory from shared_buffers_
  std::map<ValueId, int> graph_ids_to_shared_buffer_tensors_;

  std::map<ValueId, Tensor> strong_shape_tensors_;
  std::map<ValueId, ValueId> graph_ids_to_strong_shape_tensors_;

  std::vector<ValueId> input_ids_;
  std::map<ValueId, ValueId> variable_ids_and_refs_;
  std::vector<ValueId> output_ids_;

  std::unique_ptr<RecordableQueue> recordable_queue_;
};
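
// A rough serialization sketch (not part of the original header; the setup of
// `create_info`, `graph`, and `env` is assumed). InitFromGraph can optionally
// emit a serialized model, and RestoreDeserialized rebuilds a context from
// those bytes in a later run, presumably to skip graph conversion then:
//
//   std::vector<uint8_t> serialized_model;
//   InferenceContext context;
//   RETURN_IF_ERROR(
//       context.InitFromGraph(create_info, graph, &env, &serialized_model));
//   // ... persist serialized_model ...
//
//   InferenceContext restored;
//   RETURN_IF_ERROR(restored.RestoreDeserialized(
//       absl::MakeConstSpan(serialized_model), &env));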

// Runs OpenCL specific transforms for the graph.
absl::Status RunGraphTransforms(GraphFloat32* graph);

}  // namespace cl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_