/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <memory>
#include <numeric>
#include <utility>
#include <vector>

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/container/inlined_vector.h"
#include "absl/strings/str_cat.h"
#include "absl/types/span.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint"
#include "tensorflow/cc/framework/scope.h"
#include "tensorflow/cc/ops/function_ops.h"
#include "tensorflow/cc/ops/math_ops.h"
#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/fake_input.h"
#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/refcount.h"
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/public/version.h"

#if GOOGLE_CUDA && GOOGLE_TENSORRT

namespace tensorflow {
namespace tensorrt {
using ::absl::StrCat;
using ::testing::ElementsAre;

class TRTEngineOpTestBase : public OpsTestBase {
 public:
  void AddSimpleTrtOp(DataType dtype, int max_cached_engines_count = 1,
                      PartialTensorShape shape = PartialTensorShape({-1, -1}),
                      bool use_implicit_batch = true,
                      bool allow_build_at_runtime = true) {
    // Create the GPU device.
    std::unique_ptr<Device> device(
        DeviceFactory::NewDevice("GPU", {}, "/job:worker/replica:0/task:0"));

    // Create simple TF graph.
    Scope s = Scope::NewRootScope();
    auto feed = ops::_Arg(s.WithOpName("TensorRTInputPH_0"), dtype, 0);
    auto add = ops::Add(s.WithOpName("add"), feed, feed);
    ops::_Retval(s.WithOpName("TensorRTOutputPH_0"), add, 0);

    // Serialize the graph. TRTEngineOp will convert it using dynamic mode.
    GraphDef graph_def;
    TF_ASSERT_OK(s.ToGraphDef(&graph_def));
    Graph* graph = s.graph();
    const char* op_name = "myop";
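    // Register the graph as a function in the function library so that the
    // segment_func attribute (set below) can reference it by name.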
    TF_ASSERT_OK(
        convert::RegisterGraphToFunctionLibrary(graph_def, graph, op_name));
    TF_ASSERT_OK(flib_def_->AddLibrary(graph->flib_def()));

    // Create the op.
    // In implicit batch mode, the input shapes that we specify here are not
    // used for engine creation; the concrete input shapes seen at inference
    // time are used to build the engine.
    // In explicit batch mode, the input_shapes attribute is used to define
    // the network for the TensorRT engine.
    OpsTestBase::SetDevice(DEVICE_GPU, std::move(device));
    NameAttrList function;
    function.set_name(StrCat(op_name, "_native_segment"));
    // We disable allow_soft_placement when executing the native segment of
    // the TRTEngineOp for the following reasons:
    // - OpsTestBase allows only one device in the device manager, and we need
    //   that device to be the GPU in order to test TRTEngineOp.
    // - When allow_soft_placement is true, the TensorFlow runtime produces an
    //   error if a CPU device is not defined
    //   (see ProcessFunctionLibraryRuntime::InstantiateMultiDevice).
    TF_ASSERT_OK(NodeDefBuilder(op_name, "TRTEngineOp")
                     .Input(FakeInput(1, dtype))
                     .Attr("input_shapes", {shape})
                     .Attr("output_shapes", {shape})
                     .Attr("static_engine", false)
                     .Attr("segment_func", function)
                     .Attr("serialized_segment", "")
                     .Attr("calibration_data", "")
                     .Attr("max_cached_engines_count", max_cached_engines_count)
                     .Attr("workspace_size_bytes", 1 << 20)
                     .Attr("precision_mode", "FP32")
                     .Attr("use_calibration", false)
                     .Attr("_use_implicit_batch", use_implicit_batch)
                     .Attr("_allow_build_at_runtime", allow_build_at_runtime)
                     .Attr("_allow_soft_placement", false)
                     .Attr("OutT", {dtype})
                     .Finalize(OpsTestBase::node_def()));
    TF_ASSERT_OK(InitOpWithFunctionLibrary());
  }

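  // Adds a single input tensor of the given shape, filled with the running
  // sequence 0, 1, 2, ...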
  template <typename T>
  void AddSimpleInput(const TensorShape& shape) {
    std::vector<T> input(shape.num_elements());
    std::iota(input.begin(), input.end(), T(0));
    OpsTestBase::AddInputFromArray<T>(shape, input);
  }

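  // Clears previously added inputs and frees the tensors owned by the test
  // fixture, so the kernel can be re-run with inputs of a different shape.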
  void ResetInputs() {
    inputs_.clear();
    for (auto& temp : tensors_) {
      delete temp;
    }
    tensors_.clear();
  }

 private:
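  // Variant of OpsTestBase::InitOp() that passes the function library runtime
  // to CreateOpKernel, so the kernel can resolve the segment function that was
  // added to flib_def_ above.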
  Status InitOpWithFunctionLibrary() {
    OpKernel* kernel = nullptr;
    auto flr = pflr_->GetFLR(device_->name());
    std::shared_ptr<const NodeProperties> props;
    Status status = NodeProperties::CreateFromNodeDef(
        node_def_, flr->GetFunctionLibraryDefinition(), &props);
    if (status.ok()) {
      status.Update(CreateOpKernel(device_type_, device_, allocator(), flr,
                                   props, TF_GRAPH_DEF_VERSION, &kernel));
    }
    kernel_ = std::unique_ptr<OpKernel>(kernel);
    if (kernel_ != nullptr) input_types_ = kernel_->input_types();
    return status;
  }
};

TEST_F(TRTEngineOpTestBase, DynamicEngines) {
  // Test dynamic engine creation during inference time.
  TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/4);

  // Execute the op with batch size > 1.
  TRTEngineOpTestBase::AddSimpleInput<float>(TensorShape({2, 2}));
  TF_ASSERT_OK(OpsTestBase::RunOpKernel());

  // Get the engine cache.
  TRTEngineCacheResource* cache_resource = nullptr;
  TF_ASSERT_OK(
      device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource));
  core::ScopedUnref sc(cache_resource);

  // It should contain only one engine.
  auto cache = &cache_resource->cache_;
  EXPECT_EQ(1, cache->size());
  EXPECT_EQ(1, cache->count({TensorShape({2, 2})}));

  // Execute the op with batch size 1. It should reuse the existing engine.
  ResetInputs();
  TRTEngineOpTestBase::AddSimpleInput<float>(TensorShape({1, 2}));
  TF_ASSERT_OK(OpsTestBase::RunOpKernel());
  EXPECT_EQ(1, cache->size());
  EXPECT_EQ(1, cache->count({TensorShape({2, 2})}));

  // Execute the op with a larger batch size.
  ResetInputs();
  TRTEngineOpTestBase::AddSimpleInput<float>(TensorShape({3, 2}));
  TF_ASSERT_OK(OpsTestBase::RunOpKernel());
  EXPECT_EQ(2, cache->size());
  EXPECT_EQ(1, cache->count({TensorShape({2, 2})}));
  EXPECT_EQ(1, cache->count({TensorShape({3, 2})}));

  // Execute the op with an input that has a different non-batch dimension.
  ResetInputs();
  TRTEngineOpTestBase::AddSimpleInput<float>(TensorShape({10, 10}));
  TF_ASSERT_OK(OpsTestBase::RunOpKernel());
  // Execute it again with an input that has the same non-batch dimension but
  // a smaller batch size. It should find the correct engine to use.
  ResetInputs();
  TRTEngineOpTestBase::AddSimpleInput<float>(TensorShape({1, 10}));
  TF_ASSERT_OK(OpsTestBase::RunOpKernel());
  EXPECT_EQ(3, cache->size());  // Should only create 3 engines in total.
  EXPECT_EQ(1, cache->count({TensorShape({2, 2})}));
  EXPECT_EQ(1, cache->count({TensorShape({3, 2})}));
  EXPECT_EQ(1, cache->count({TensorShape({10, 10})}));
}

TEST_F(TRTEngineOpTestBase, AllowBuildAtRuntime) {
  TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1,
                                      PartialTensorShape({-1, -1}),
                                      /*use_implicit_batch=*/true,
                                      /*allow_build_at_runtime=*/false);

  // Execute the op.
  TensorShape input_shape({2, 2});
  TRTEngineOpTestBase::AddSimpleInput<float>(input_shape);
  TF_ASSERT_OK(OpsTestBase::RunOpKernel());

  // Get the engine cache.
  TRTEngineCacheResource* cache_resource = nullptr;
  TF_ASSERT_OK(
      device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource));
  core::ScopedUnref sc(cache_resource);

  // It should contain a placeholder with an empty cuda_engine (to mark that
  // engine creation was not successful for the given input shape).
  auto cache = &cache_resource->cache_;
  EXPECT_EQ(1, cache->size());
  ASSERT_EQ(1, cache->count({input_shape}));
  EngineContext* ectx = cache->at({input_shape}).get();
  EXPECT_EQ(ectx->cuda_engine, nullptr);
}

#if IS_TRT_VERSION_GE(6, 0, 0, 0)
TEST_F(TRTEngineOpTestBase, ExplicitBatch) {
  // Test inference in explicit batch mode with static input shapes. Static
  // shapes in this context mean that TensorRT knows all the input shapes at
  // engine creation time.
  TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1,
                                      /*shape=*/PartialTensorShape({1, 2}),
                                      /*use_implicit_batch=*/false);

  TensorShape input_shape({1, 2});
  TRTEngineOpTestBase::AddSimpleInput<float>(input_shape);
  TF_ASSERT_OK(OpsTestBase::RunOpKernel());

  // Get the engine cache.
  TRTEngineCacheResource* cache_resource = nullptr;
  TF_ASSERT_OK(
      device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource));
  core::ScopedUnref sc(cache_resource);

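  // A single engine with a successfully built cuda_engine should be cached
  // for the static input shape.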
  auto cache = &cache_resource->cache_;
  EXPECT_EQ(1, cache->size());
  ASSERT_EQ(1, cache->count({input_shape}));
  EngineContext* ectx = cache->at({input_shape}).get();
  EXPECT_NE(ectx->cuda_engine, nullptr);
}

TEST_F(TRTEngineOpTestBase, DynamicShapes) {
  // Test inference in explicit batch mode with dynamic input shapes. Dynamic
  // shapes in this context mean that some input shapes are unknown to
  // TensorRT at engine creation time. When we create the network, the unknown
  // shapes are represented as -1. Before we run inference, these shapes have
  // to be specified by calling setBindingDimensions.
  TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1,
                                      /*shape=*/PartialTensorShape({-1, -1}),
                                      /*use_implicit_batch=*/false);

  TensorShape input_shape({1, 2});
  TRTEngineOpTestBase::AddSimpleInput<float>(input_shape);

  TF_ASSERT_OK(OpsTestBase::RunOpKernel());

  // Get the engine cache.
  TRTEngineCacheResource* cache_resource = nullptr;
  TF_ASSERT_OK(
      device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource));
  core::ScopedUnref sc(cache_resource);

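  // Even though the network was built with unknown (-1) dimensions, an engine
  // with a valid cuda_engine should be cached under the concrete input shape.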
  auto cache = &cache_resource->cache_;
  EXPECT_EQ(1, cache->size());
  ASSERT_EQ(1, cache->count({input_shape}));
  EngineContext* ectx = cache->at({input_shape}).get();
  EXPECT_NE(ectx->cuda_engine, nullptr);
}

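// Typed test fixture that runs the test below once for each supported data
// type.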
template <typename T>
class TRTEngineOpTest : public TRTEngineOpTestBase {};

using TypeList = ::testing::Types<float, Eigen::half>;
TYPED_TEST_SUITE(TRTEngineOpTest, TypeList);

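// The test graph computes add(x, x), so every output element should be twice
// the corresponding input element.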
TYPED_TEST(TRTEngineOpTest, Basic) {
  TRTEngineOpTestBase::AddSimpleTrtOp(DataTypeToEnum<TypeParam>::v());

  // Execute the op.
  OpsTestBase::AddInputFromArray<TypeParam>(
      TensorShape({1, 2}), {TypeParam(0.0f), TypeParam(1.0f)});
  TF_ASSERT_OK(OpsTestBase::RunOpKernel());

  // Verify the result.
  Tensor* output = OpsTestBase::GetOutput(0);
  EXPECT_THAT(
      absl::Span<const TypeParam>(output->template flat<TypeParam>().data(),
                                  output->NumElements()),
      ElementsAre(TypeParam(0.0f), TypeParam(2.0f)));
}
#endif

}  // namespace tensorrt
}  // namespace tensorflow

#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT