/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ENGINE_UTILS_H_
#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ENGINE_UTILS_H_

#include <string>
#include <vector>

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"

#if GOOGLE_CUDA && GOOGLE_TENSORRT
#include "third_party/tensorrt/NvInfer.h"

namespace tensorflow {
namespace tensorrt {
using ::stream_executor::port::StatusOr;

// Input/output data format for OpConverterTest::BuildAndRun().
struct InputOutputData {
  void* Buffer() const {
    return const_cast<char*>(tensor.tensor_data().data());
  }

  size_t TotalBytes() const { return tensor.TotalBytes(); }

  string name;
  Tensor tensor;
};

class TRTBaseAllocator;

// Keeps track of the TensorRT execution context and the device memory owned by
// the context, if any. An execution context owns the device memory that TF-TRT
// allocates for the context. In this case, the allocator is not null and is
// used to free the device memory. An execution context doesn't own its device
// memory (1) if the device memory is allocated through TensorRT, or (2) if the
// device memory is allocated by TF-TRT for another execution context but
// shared with this context. In this case, the device memory is null.
//
// Currently, the main reason we want to allocate the device memory for an
// execution context in TF-TRT is that the TensorRT API for creating an
// execution context with device memory doesn't handle out-of-memory conditions
// properly.
//
// To support dynamic shapes, we create multiple execution contexts for an
// engine and may want to support multiple execution contexts sharing the same
// device memory.
class ExecutionContext {
 private:
  // Records the TensorRT execution context `context`, the device memory
  // `device_memory` that TF-TRT allocates for the context, and the device
  // memory allocator `allocator` used to allocate the memory. If TF-TRT
  // doesn't allocate any device memory for the context, then `device_memory`
  // is null; otherwise, `allocator` should not be null.
  ExecutionContext(TRTBaseAllocator* allocator, void* device_memory,
                   nvinfer1::IExecutionContext* context)
      : memory_allocator_(allocator),
        device_memory_(device_memory),
        execution_context_(context) {}

 public:
  // Disables copy construction and assignment as the object owns the device
  // memory and the execution context.
  ExecutionContext(const ExecutionContext&) = delete;
  ExecutionContext& operator=(const ExecutionContext&) = delete;

  ExecutionContext(ExecutionContext&& other)
      : memory_allocator_(other.memory_allocator_),
        device_memory_(other.device_memory_),
        execution_context_(other.execution_context_) {
    other.memory_allocator_ = nullptr;
    other.device_memory_ = nullptr;
    other.execution_context_ = nullptr;
  }

  ~ExecutionContext();

  operator nvinfer1::IExecutionContext*() const { return execution_context_; }
  nvinfer1::IExecutionContext* GetIExecutionContext() const {
    return execution_context_;
  }

  static StatusOr<ExecutionContext> Create(nvinfer1::ICudaEngine* cuda_engine,
                                           TRTBaseAllocator* allocator);

 private:
  // The allocator used to allocate and free the device memory owned by the
  // execution context.
  TRTBaseAllocator* memory_allocator_;
  // The device memory owned by the execution context.
  void* device_memory_;
  // The TensorRT execution context.
  nvinfer1::IExecutionContext* execution_context_;
};

// Creates a TensorRT execution context. If an allocator is not given, then the
// execution context is created with device memory allocated by TensorRT.
// Otherwise, uses the allocator to allocate the needed device memory for the
// execution context.
//
// Returns an ExecutionContext object that wraps the above results. If the
// device runs out of memory, returns an error status instead.
StatusOr<ExecutionContext> CreateExecutionContext(
    nvinfer1::ICudaEngine* cuda_engine, TRTBaseAllocator* allocator);
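
// Example usage (a sketch only; `engine` and `allocator` are assumed to be
// provided by the calling kernel, and error handling is elided):
//
//   StatusOr<ExecutionContext> context =
//       ExecutionContext::Create(engine, allocator);
//   if (!context.ok()) return context.status();
//   // ExecutionContext converts implicitly to nvinfer1::IExecutionContext*.
//   nvinfer1::IExecutionContext* trt_context = context.ValueOrDie();
//
// Passing a null allocator requests TensorRT-managed device memory instead,
// as described above.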

using DataVec = std::vector<InputOutputData>;

// Gets the binding index of a tensor in an engine.
//
// The binding index is looked up using the tensor's name and the profile
// index. The profile index should be set to zero if there are no optimization
// profiles.
Status GetTrtBindingIndex(const char* tensor_name, int profile_index,
                          const nvinfer1::ICudaEngine* cuda_engine,
                          int* binding_index);

// Sets input buffers for TRT from a list of input tensors. The input tensors
// are either defined by ctx or by input_vec.
Status SetTrtEngineInputs(nvinfer1::ICudaEngine* cuda_engine,
                          nvinfer1::IExecutionContext* execution_context,
                          const int trt_profile_idx,
                          std::vector<void*>& buffers, bool use_implicit_batch,
                          int num_batch, OpKernelContext* ctx = nullptr,
                          const DataVec* input_vec = nullptr);

// Returns the shape of a binding from TensorRT.
//
// The binding is identified by its binding_index. The batch_size argument is
// ignored if use_implicit_batch==false. The shape is returned in the last
// argument.
Status GetTrtBindingShape(const nvinfer1::ICudaEngine* cuda_engine,
                          const nvinfer1::IExecutionContext* execution_context,
                          int binding_index, bool use_implicit_batch,
                          int batch_size, TensorShape& shape);

// Defines output buffers for TRT. The buffers are allocated by ctx if ctx is
// not null. Otherwise it is expected that the outputs DataVec is not null and
// that the Tensors in outputs are already allocated.
Status SetTrtEngineOutputs(nvinfer1::ICudaEngine* cuda_engine,
                           nvinfer1::IExecutionContext* execution_context,
                           int trt_profile_idx, std::vector<void*>& buffers,
                           bool use_implicit_batch, int batch_size = 0,
                           OpKernelContext* ctx = nullptr,
                           DataVec* outputs = nullptr);

// Enqueues a TensorRT inference job. The batch_size argument is only relevant
// in implicit batch mode.
Status TrtEnqueue(nvinfer1::IExecutionContext* execution_context,
                  std::vector<void*>& buffers, cudaStream_t stream,
                  bool use_implicit_batch, int batch_size = 1);
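
// A rough sketch of how these helpers fit together when running inference
// (illustrative only; `engine`, `context`, `ctx`, `stream`, `num_batch` and
// `use_implicit_batch` are assumed to be set up by the caller, e.g. the
// TRTEngineOp kernel, and error handling beyond status propagation is elided):
//
//   std::vector<void*> buffers(engine->getNbBindings());
//   TF_RETURN_IF_ERROR(SetTrtEngineInputs(engine, context,
//                                         /*trt_profile_idx=*/0, buffers,
//                                         use_implicit_batch, num_batch, ctx));
//   TF_RETURN_IF_ERROR(SetTrtEngineOutputs(engine, context,
//                                          /*trt_profile_idx=*/0, buffers,
//                                          use_implicit_batch, num_batch, ctx));
//   TF_RETURN_IF_ERROR(TrtEnqueue(context, buffers, stream,
//                                 use_implicit_batch, num_batch));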

}  // namespace tensorrt
}  // namespace tensorflow

#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT

#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ENGINE_UTILS_H_