/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <memory>
#include <numeric>
#include <utility>
#include <vector>

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/container/inlined_vector.h"
#include "absl/strings/str_cat.h"
#include "absl/types/span.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint"
#include "tensorflow/cc/framework/scope.h"
#include "tensorflow/cc/ops/function_ops.h"
#include "tensorflow/cc/ops/math_ops.h"
#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/fake_input.h"
#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/refcount.h"
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/public/version.h"

#if GOOGLE_CUDA && GOOGLE_TENSORRT

namespace tensorflow {
namespace tensorrt {
using ::absl::StrCat;
using ::testing::ElementsAre;

class TRTEngineOpTestBase : public OpsTestBase {
 public:
  void AddSimpleTrtOp(DataType dtype, int max_cached_engines_count = 1,
                      PartialTensorShape shape = PartialTensorShape({-1, -1}),
                      bool use_implicit_batch = true,
                      bool allow_build_at_runtime = true) {
    // Create the GPU device.
    std::unique_ptr<Device> device(
        DeviceFactory::NewDevice("GPU", {}, "/job:worker/replica:0/task:0"));

    // Create a simple TF graph: a single Add node that doubles its input.
    Scope s = Scope::NewRootScope();
    auto feed = ops::_Arg(s.WithOpName("TensorRTInputPH_0"), dtype, 0);
    auto add = ops::Add(s.WithOpName("add"), feed, feed);
    ops::_Retval(s.WithOpName("TensorRTOutputPH_0"), add, 0);

    // Serialize the graph. TRTEngineOp will convert it using dynamic mode.
    GraphDef graph_def;
    TF_ASSERT_OK(s.ToGraphDef(&graph_def));
    Graph* graph = s.graph();
    const char* op_name = "myop";
    TF_ASSERT_OK(
        convert::RegisterGraphToFunctionLibrary(graph_def, graph, op_name));
    TF_ASSERT_OK(flib_def_->AddLibrary(graph->flib_def()));

    // Create the op.
    // In implicit batch mode, the input shapes specified here are not used
    // for engine creation; the concrete shapes seen at inference time are
    // used to create the engine instead.
    // In explicit batch mode, the input shapes attribute is used to define
    // the network for the TensorRT engine.
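    // For example (mirroring the ExplicitBatch test below), a fully known
    // shape with use_implicit_batch=false builds a static explicit-batch
    // network:
    //   AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1,
    //                  /*shape=*/PartialTensorShape({1, 2}),
    //                  /*use_implicit_batch=*/false);
    // whereas PartialTensorShape({-1, -1}) leaves those dimensions dynamic.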
    OpsTestBase::SetDevice(DEVICE_GPU, std::move(device));
    NameAttrList function;
    function.set_name(StrCat(op_name, "_native_segment"));
    // We disable allow_soft_placement when executing the native segment of the
    // TRTEngineOp for the following reasons:
    //    OpsTestBase only allows one device in the device manager.
    //    We need to define the GPU device to test TRTEngineOp.
    //    When allow_soft_placement is true, the TensorFlow runtime produces an
    //      error if a CPU device is not defined
    //      (see ProcessFunctionLibraryRuntime::InstantiateMultiDevice).
    TF_ASSERT_OK(NodeDefBuilder(op_name, "TRTEngineOp")
                     .Input(FakeInput(1, dtype))
                     .Attr("input_shapes", {shape})
                     .Attr("output_shapes", {shape})
                     .Attr("static_engine", false)
                     .Attr("segment_func", function)
                     .Attr("serialized_segment", "")
                     .Attr("calibration_data", "")
                     .Attr("max_cached_engines_count", max_cached_engines_count)
                     .Attr("workspace_size_bytes", 1 << 20)
                     .Attr("precision_mode", "FP32")
                     .Attr("use_calibration", false)
                     .Attr("_use_implicit_batch", use_implicit_batch)
                     .Attr("_allow_build_at_runtime", allow_build_at_runtime)
                     .Attr("_allow_soft_placement", false)
                     .Attr("OutT", {dtype})
                     .Finalize(OpsTestBase::node_def()));
    TF_ASSERT_OK(InitOpWithFunctionLibrary());
  }

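  // Feeds an input tensor whose flat values are 0, 1, 2, ... via std::iota.
  // For example, AddSimpleInput<float>(TensorShape({2, 2})) adds the 2x2
  // tensor {{0, 1}, {2, 3}} as input.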
  template <typename T>
  void AddSimpleInput(const TensorShape& shape) {
    std::vector<T> input(shape.num_elements());
    std::iota(input.begin(), input.end(), T(0));
    OpsTestBase::AddInputFromArray<T>(shape, input);
  }

  void ResetInputs() {
    inputs_.clear();
    for (auto& temp : tensors_) {
      delete temp;
    }
    tensors_.clear();
  }

 private:
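  // Initializes the kernel through the function library runtime rather than
  // plain OpsTestBase::InitOp, so that kernel creation can resolve the op
  // against the function library that holds the registered native segment
  // (the "segment_func" attr set above).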
  Status InitOpWithFunctionLibrary() {
    OpKernel* kernel = nullptr;
    auto flr = pflr_->GetFLR(device_->name());
    std::shared_ptr<const NodeProperties> props;
    Status status = NodeProperties::CreateFromNodeDef(
        node_def_, flr->GetFunctionLibraryDefinition(), &props);
    if (status.ok()) {
      status.Update(CreateOpKernel(device_type_, device_, allocator(), flr,
                                   props, TF_GRAPH_DEF_VERSION, &kernel));
    }
    kernel_ = std::unique_ptr<OpKernel>(kernel);
    if (kernel_ != nullptr) input_types_ = kernel_->input_types();
    return status;
  }
};

TEST_F(TRTEngineOpTestBase, DynamicEngines) {
  // Test dynamic engine creation at inference time.
  TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/4);

  // Execute the op with batch size > 1.
  TRTEngineOpTestBase::AddSimpleInput<float>(TensorShape({2, 2}));
  TF_ASSERT_OK(OpsTestBase::RunOpKernel());

  // Get the engine cache.
  TRTEngineCacheResource* cache_resource = nullptr;
  TF_ASSERT_OK(
      device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource));
  core::ScopedUnref sc(cache_resource);

  // It should contain only one engine.
  auto cache = &cache_resource->cache_;
  EXPECT_EQ(1, cache->size());
  EXPECT_EQ(1, cache->count({TensorShape({2, 2})}));

  // Execute the op with batch size 1. It should reuse the existing engine:
  // in implicit batch mode, an engine built for batch size 2 can serve any
  // smaller batch with the same non-batch dimensions.
  ResetInputs();
  TRTEngineOpTestBase::AddSimpleInput<float>(TensorShape({1, 2}));
  TF_ASSERT_OK(OpsTestBase::RunOpKernel());
  EXPECT_EQ(1, cache->size());
  EXPECT_EQ(1, cache->count({TensorShape({2, 2})}));

  // Execute the op with a larger batch size.
  ResetInputs();
  TRTEngineOpTestBase::AddSimpleInput<float>(TensorShape({3, 2}));
  TF_ASSERT_OK(OpsTestBase::RunOpKernel());
  EXPECT_EQ(2, cache->size());
  EXPECT_EQ(1, cache->count({TensorShape({2, 2})}));
  EXPECT_EQ(1, cache->count({TensorShape({3, 2})}));

  // Execute the op with an input that has a different non-batch dimension.
  ResetInputs();
  TRTEngineOpTestBase::AddSimpleInput<float>(TensorShape({10, 10}));
  TF_ASSERT_OK(OpsTestBase::RunOpKernel());
  // Execute it again with an input that has the same non-batch dimension but
  // a smaller batch size. It should find the correct engine to use.
  ResetInputs();
  TRTEngineOpTestBase::AddSimpleInput<float>(TensorShape({1, 10}));
  TF_ASSERT_OK(OpsTestBase::RunOpKernel());
  EXPECT_EQ(3, cache->size());  // Should only create 3 engines in total.
  EXPECT_EQ(1, cache->count({TensorShape({2, 2})}));
  EXPECT_EQ(1, cache->count({TensorShape({3, 2})}));
  EXPECT_EQ(1, cache->count({TensorShape({10, 10})}));
}

TEST_F(TRTEngineOpTestBase, AllowBuildAtRuntime) {
  TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1,
                                      PartialTensorShape({-1, -1}),
                                      /*use_implicit_batch=*/true,
                                      /*allow_build_at_runtime=*/false);

  // Execute the op.
  TensorShape input_shape({2, 2});
  TRTEngineOpTestBase::AddSimpleInput<float>(input_shape);
  TF_ASSERT_OK(OpsTestBase::RunOpKernel());

  // Get the engine cache.
  TRTEngineCacheResource* cache_resource = nullptr;
  TF_ASSERT_OK(
      device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource));
  core::ScopedUnref sc(cache_resource);

  // It should contain a placeholder with an empty cuda_engine (to mark that
  // engine creation was not successful for the given input shape).
  auto cache = &cache_resource->cache_;
  EXPECT_EQ(1, cache->size());
  ASSERT_EQ(1, cache->count({input_shape}));
  EngineContext* ectx = cache->at({input_shape}).get();
  EXPECT_EQ(ectx->cuda_engine, nullptr);
}

#if IS_TRT_VERSION_GE(6, 0, 0, 0)
TEST_F(TRTEngineOpTestBase, ExplicitBatch) {
  // Test inference in explicit batch mode with static input shapes. Static
  // shapes in this context mean that TensorRT knows all the input shapes
  // at engine creation time.
  TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1,
                                      /*shape=*/PartialTensorShape({1, 2}),
                                      /*use_implicit_batch=*/false);

  TensorShape input_shape({1, 2});
  TRTEngineOpTestBase::AddSimpleInput<float>(input_shape);
  TF_ASSERT_OK(OpsTestBase::RunOpKernel());

  // Get the engine cache.
  TRTEngineCacheResource* cache_resource = nullptr;
  TF_ASSERT_OK(
      device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource));
  core::ScopedUnref sc(cache_resource);

  auto cache = &cache_resource->cache_;
  EXPECT_EQ(1, cache->size());
  ASSERT_EQ(1, cache->count({input_shape}));
  EngineContext* ectx = cache->at({input_shape}).get();
  EXPECT_NE(ectx->cuda_engine, nullptr);
}

TEST_F(TRTEngineOpTestBase, DynamicShapes) {
  // Test inference in explicit batch mode with dynamic input shapes. Dynamic
  // shapes in this context mean that some input shapes are unknown to
  // TensorRT at engine creation time. When we create the network, the
  // unknown shapes are represented as -1. Before we run inference, these
  // shapes have to be specified by calling setBindingDimensions.
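  // For illustration only (the op performs this internally), the TensorRT
  // API call looks roughly like:
  //   nvinfer1::IExecutionContext* context = ...;  // created from the engine
  //   context->setBindingDimensions(/*bindingIndex=*/0, nvinfer1::Dims2(1, 2));
  //   bool ready = context->allInputDimensionsSpecified();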
  TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1,
                                      /*shape=*/PartialTensorShape({-1, -1}),
                                      /*use_implicit_batch=*/false);

  TensorShape input_shape({1, 2});
  TRTEngineOpTestBase::AddSimpleInput<float>(input_shape);

  TF_ASSERT_OK(OpsTestBase::RunOpKernel());

  // Get the engine cache.
  TRTEngineCacheResource* cache_resource = nullptr;
  TF_ASSERT_OK(
      device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource));
  core::ScopedUnref sc(cache_resource);

  auto cache = &cache_resource->cache_;
  EXPECT_EQ(1, cache->size());
  ASSERT_EQ(1, cache->count({input_shape}));
  EngineContext* ectx = cache->at({input_shape}).get();
  EXPECT_NE(ectx->cuda_engine, nullptr);
}

template <typename T>
class TRTEngineOpTest : public TRTEngineOpTestBase {};

using TypeList = ::testing::Types<float, Eigen::half>;
TYPED_TEST_SUITE(TRTEngineOpTest, TypeList);

TYPED_TEST(TRTEngineOpTest, Basic) {
  TRTEngineOpTestBase::AddSimpleTrtOp(DataTypeToEnum<TypeParam>::v());

  // Execute the op.
  OpsTestBase::AddInputFromArray<TypeParam>(TensorShape({1, 2}),
                                            {TypeParam(0.0f), TypeParam(1.0f)});
  TF_ASSERT_OK(OpsTestBase::RunOpKernel());

  // Verify the result: the segment computes x + x, so each output element is
  // double the corresponding input element.
  Tensor* output = OpsTestBase::GetOutput(0);
  EXPECT_THAT(
      absl::Span<const TypeParam>(output->template flat<TypeParam>().data(),
                                  output->NumElements()),
      ElementsAre(TypeParam(0.0f), TypeParam(2.0f)));
}
#endif  // IS_TRT_VERSION_GE(6, 0, 0, 0)

}  // namespace tensorrt
}  // namespace tensorflow

#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT