1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_ 17 #define TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_ 18 19 #include <cstdint> 20 #include <functional> 21 #include <map> 22 #include <memory> 23 #include <vector> 24 25 #include "absl/container/flat_hash_map.h" 26 #include "tensorflow/lite/delegates/gpu/cl/buffer.h" 27 #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h" 28 #include "tensorflow/lite/delegates/gpu/cl/cl_operation.h" 29 #include "tensorflow/lite/delegates/gpu/cl/environment.h" 30 #include "tensorflow/lite/delegates/gpu/cl/gpu_object.h" 31 #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h" 32 #include "tensorflow/lite/delegates/gpu/cl/serialization_generated.h" 33 #include "tensorflow/lite/delegates/gpu/cl/tensor.h" 34 #include "tensorflow/lite/delegates/gpu/common/model.h" 35 #include "tensorflow/lite/delegates/gpu/common/model_hints.h" 36 #include "tensorflow/lite/delegates/gpu/common/precision.h" 37 #include "tensorflow/lite/delegates/gpu/common/status.h" 38 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h" 39 #include "tensorflow/lite/delegates/gpu/common/tensor.h" 40 41 namespace tflite { 42 namespace gpu { 43 namespace cl { 44 45 struct CLNode { 46 ClOperation cl_operation; 47 std::vector<ValueId> inputs; 48 std::vector<ValueId> outputs; 49 50 // Mostly for debug purposes. 51 std::string name; 52 53 CLNode() = default; 54 55 CLNode(CLNode&& node) = default; 56 CLNode& operator=(CLNode&& node) = default; 57 CLNode(const CLNode&) = delete; 58 CLNode& operator=(const CLNode&) = delete; 59 }; 60 61 class InferenceContext { 62 public: 63 struct CreateInferenceInfo { 64 CalculationsPrecision precision; 65 TensorStorageType storage_type; 66 ModelHints hints; 67 }; 68 69 absl::Status InitFromGraph(const CreateInferenceInfo& create_info, 70 const GraphFloat32& graph, Environment* env, 71 std::vector<uint8_t>* serialized_model = nullptr); 72 73 // Applies OpenCL-specific transformations to the graph before the 74 // initialization. These transformations are either impossible or useless in 75 // other backends. 76 absl::Status InitFromGraphWithTransforms( 77 const CreateInferenceInfo& create_info, GraphFloat32* graph, 78 Environment* env, std::vector<uint8_t>* serialized_model = nullptr); 79 80 absl::Status AddToQueue(CLCommandQueue* queue); 81 absl::Status Profile(ProfilingCommandQueue* queue, ProfilingInfo* result); 82 // for profiling and memory statistics 83 uint64_t GetSizeOfMemoryAllocatedForIntermediateTensors() const; 84 85 absl::Status SetInputTensor(ValueId id, const TensorFloat32& tensor, 86 CLCommandQueue* queue); 87 88 // It will work only with input/output tensor ids. For all other ids we don't 89 // have any guarantees. 90 Tensor* GetTensor(ValueId id); 91 92 absl::Status GetOutputTensor(ValueId id, CLCommandQueue* queue, 93 TensorFloat32* result); 94 GetInputIds()95 const std::vector<ValueId>& GetInputIds() const { return input_ids_; } GetOutputIds()96 const std::vector<ValueId>& GetOutputIds() const { return output_ids_; } 97 GetInputRefs()98 const std::vector<int64_t>& GetInputRefs() const { return in_refs_; } GetOutputRefs()99 const std::vector<int64_t>& GetOutputRefs() const { return out_refs_; } 100 101 absl::Status RestoreDeserialized( 102 const absl::Span<const uint8_t> serialized_model, Environment* env); 103 104 private: 105 enum class TensorMemoryType { kStrongShape, kBuffer, kVariable, kConst }; 106 107 friend flatbuffers::Offset<data::InferenceContext> Encode( 108 const InferenceContext& inference, 109 flatbuffers::FlatBufferBuilder* builder); 110 friend absl::Status Decode(const data::InferenceContext* fb_inference, 111 InferenceContext* inference); 112 113 void CopyInAndOutIds(const GraphFloat32& graph); 114 absl::Status ConvertOperations(const GpuInfo& gpu_info, 115 const GraphFloat32& graph, ModelHints hints); 116 void CreateLinks(); 117 void ReserveGraphTensors(const CreateInferenceInfo& create_info, 118 const GpuInfo& gpu_info, const GraphFloat32& graph); 119 absl::Status Merge(); 120 absl::Status AllocateMemory(CLContext* context); 121 122 absl::Status AllocateMemoryForConstTensors(CLContext* context); 123 124 absl::Status AllocateMemoryForVariableTensors(CLContext* context); 125 126 absl::Status AllocateMemoryForBuffers(CLContext* context); 127 128 absl::Status AllocateMemoryForStrongShapes(CLContext* context); 129 130 // utility function 131 void GetUsages(const std::function<bool(ValueId)>& functor, 132 std::map<ValueId, int2>* usages); 133 134 TensorMemoryType GetTensorMemoryType(ValueId id); 135 136 void BindMemoryToOperations(); 137 absl::Status Compile(const CreationContext& creation_context); 138 absl::Status Tune(TuningType tuning_type, const GpuInfo& gpu_info, 139 ProfilingCommandQueue* profiling_queue); 140 absl::Status UpdateParams(); 141 142 void ReleaseCPURepresentation(); 143 144 // performance hacks 145 bool need_flush_ = false; 146 147 bool flush_periodically_ = false; 148 int flush_period_ = 1; 149 150 // In order to reduce memory leak on Mali a pipeline needs to be synchronized 151 // with CPU to prevent growing internal global OpenCL kernel pool. One trick 152 // is to enqueue an event from a previous run. Most of the time is should 153 // already be executed on GPU and should not stall the pipeline. 154 bool need_manual_release_ = false; 155 CLEvent prev_enqueue_start_point_; 156 157 CalculationsPrecision precision_; 158 TensorStorageType storage_type_; 159 160 // Directly mapped nodes from graph, but some of them "inactive" due 161 // to fusion (inactive = fused). 162 // Memory is allocated only once, in ConvertOperations, and is not modified 163 // anywhere. 164 std::vector<CLNode> nodes_; 165 166 struct DummyTensor { 167 BHWC shape; 168 TensorDescriptor descriptor; 169 170 bool operator==(const DummyTensor& b) const { 171 return shape == b.shape && descriptor == b.descriptor; 172 } 173 }; 174 175 class TensorReserver { 176 public: TensorReserver()177 TensorReserver() : next_(0) {} Add(const DummyTensor & dummy)178 ValueId Add(const DummyTensor& dummy) { 179 reservations_[next_] = dummy; 180 return next_++; 181 } Add(ValueId id,const DummyTensor & dummy)182 void Add(ValueId id, const DummyTensor& dummy) { 183 reservations_[id] = dummy; 184 } SetNext(ValueId id)185 void SetNext(ValueId id) { next_ = id; } Get(ValueId id)186 DummyTensor Get(ValueId id) { return reservations_[id]; } 187 GetTensorDescs()188 std::vector<std::pair<ValueId, TensorDescriptor>> GetTensorDescs() const { 189 std::vector<std::pair<ValueId, TensorDescriptor>> result; 190 for (auto& v : reservations_) { 191 TensorDescriptor desc = v.second.descriptor; 192 desc.shape.b = v.second.shape.b; 193 desc.shape.h = v.second.shape.h; 194 desc.shape.w = v.second.shape.w; 195 desc.shape.d = 1; 196 desc.shape.c = v.second.shape.c; 197 result.push_back({v.first, desc}); 198 } 199 return result; 200 } 201 Add(const std::vector<std::pair<ValueId,TensorDescriptor>> & tensors)202 void Add(const std::vector<std::pair<ValueId, TensorDescriptor>>& tensors) { 203 for (auto& v : tensors) { 204 DummyTensor dummy; 205 dummy.descriptor = v.second; 206 dummy.shape.b = v.second.shape.b; 207 dummy.shape.h = v.second.shape.h; 208 dummy.shape.w = v.second.shape.w; 209 dummy.shape.c = v.second.shape.c; 210 Add(v.first, dummy); 211 } 212 } 213 214 private: 215 absl::flat_hash_map<ValueId, DummyTensor> reservations_; 216 ValueId next_; 217 }; 218 TensorReserver tensor_reserver_; 219 220 absl::flat_hash_map<ValueId, TensorDescriptor> const_tensors_descs_; 221 std::map<ValueId, Tensor> const_tensors_; 222 223 std::map<ValueId, Tensor> variable_tensors_; 224 std::vector<Buffer> shared_buffers_; 225 std::vector<Tensor> 226 shared_buffer_tensors_; // use references to memory from shared_buffers_ 227 std::map<ValueId, int> graph_ids_to_shared_buffer_tensors_; 228 229 std::map<ValueId, Tensor> strong_shape_tensors_; 230 std::map<ValueId, ValueId> graph_ids_to_strong_shape_tensors_; 231 232 std::vector<ValueId> input_ids_; 233 std::map<ValueId, ValueId> variable_ids_and_refs_; 234 std::vector<ValueId> output_ids_; 235 236 // for serialization 237 std::vector<int64_t> in_refs_; 238 std::vector<int64_t> out_refs_; 239 }; 240 241 // Runs OpenCL specific transforms for the graph. 242 absl::Status RunGraphTransforms(GraphFloat32* graph); 243 244 } // namespace cl 245 } // namespace gpu 246 } // namespace tflite 247 248 #endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_ 249