/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"

#include <algorithm>
#include <cmath>
#include <cstring>
#include <map>
#include <memory>
#include <set>
#include <unordered_map>
#include <utility>
#include <vector>

#include "absl/algorithm/container.h"
#include "absl/container/flat_hash_set.h"
#include "absl/memory/memory.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/string_view.h"
#include "tensorflow/compiler/tf2tensorrt/common/utils.h"
#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h"
#include "tensorflow/core/common_runtime/graph_constructor.h"
#include "tensorflow/core/framework/node_def.pb.h"  // NOLINT
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"  // NOLINT
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/grappler/op_types.h"
#include "tensorflow/core/kernels/linalg/einsum_op_impl.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/platform/tensor_coding.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/lib/annotated_traceme.h"
#include "tensorflow/core/public/version.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/core/util/strided_slice_op.h"

#if GOOGLE_CUDA && GOOGLE_TENSORRT
#include "third_party/tensorrt/NvInfer.h"
#include "third_party/tensorrt/NvInferPlugin.h"

// Check if the types are equal. Cast to int first so that failure log message
// would work!
#define TFTRT_CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2)

#define TFTRT_INTERNAL_ERROR_AT_NODE(node)                           \
  do {                                                               \
    return errors::Internal("TFTRT::", __FUNCTION__, ":", __LINE__,  \
                            " failed to add TRT layer, at: ", node); \
  } while (0)

#define TFTRT_RETURN_ERROR_IF_NULLPTR(ptr, node) \
  do {                                           \
    if (ptr == nullptr) {                        \
      TFTRT_INTERNAL_ERROR_AT_NODE(node);        \
    }                                            \
  } while (0)

namespace tensorflow {
namespace tensorrt {
namespace convert {

using absl::StrAppend;
using absl::StrCat;

namespace {

#define ADD_LAYER(layer_name)              \
  case nvinfer1::LayerType::k##layer_name: \
    return #layer_name;

const char* LayerTypeToString(nvinfer1::LayerType layer_type) {
  switch (layer_type) {
    ADD_LAYER(CONVOLUTION)
    ADD_LAYER(FULLY_CONNECTED)
    ADD_LAYER(ACTIVATION)
    ADD_LAYER(POOLING)
    ADD_LAYER(LRN)
    ADD_LAYER(SCALE)
    ADD_LAYER(SOFTMAX)
    ADD_LAYER(DECONVOLUTION)
    ADD_LAYER(CONCATENATION)
    ADD_LAYER(ELEMENTWISE)
    ADD_LAYER(PLUGIN)
    ADD_LAYER(UNARY)
    ADD_LAYER(PADDING)
    ADD_LAYER(SHUFFLE)
    ADD_LAYER(REDUCE)
    ADD_LAYER(TOPK)
    ADD_LAYER(GATHER)
    ADD_LAYER(MATRIX_MULTIPLY)
    ADD_LAYER(RAGGED_SOFTMAX)
    ADD_LAYER(CONSTANT)
    ADD_LAYER(RNN_V2)
    ADD_LAYER(IDENTITY)
    ADD_LAYER(PLUGIN_V2)
    ADD_LAYER(SLICE)
    ADD_LAYER(SHAPE)
    ADD_LAYER(PARAMETRIC_RELU)
    ADD_LAYER(RESIZE)
    ADD_LAYER(TRIP_LIMIT)
    ADD_LAYER(RECURRENCE)
    ADD_LAYER(ITERATOR)
    ADD_LAYER(LOOP_OUTPUT)
    ADD_LAYER(SELECT)
    ADD_LAYER(FILL)
#if IS_TRT_VERSION_GE(8, 0, 0, 0)
    ADD_LAYER(QUANTIZE)
    ADD_LAYER(DEQUANTIZE)
#else
    // The TRT IRNNv2Layer has been deprecated in favor of the loop API.
    ADD_LAYER(RNN)
#endif
  }
  return "UNKNOWN_LAYER";
}

#undef ADD_LAYER

// Sets the ILayer name in the form of
// <engine_name>/<tf_related_part>:<trt_operation_name>.
void SetLayerNameHelper(nvinfer1::ILayer* layer, absl::string_view engine_name,
                        absl::string_view tf_name) {
  const char* trt_name = LayerTypeToString(layer->getType());
  layer->setName(
      absl::StrCat(engine_name, "/", tf_name, ":", trt_name).c_str());
}

// Returns a string in the form of <sub_op_name><sub_op_instance>.
std::string GetLayerNameSuffix(absl::string_view sub_op_name,
                               absl::optional<int> sub_op_instance) {
  std::string op_suffix(sub_op_name);
  if (sub_op_instance.has_value()) {
    op_suffix =
        absl::StrCat(op_suffix, "_", std::to_string(sub_op_instance.value()));
  }
  return op_suffix;
}
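// Illustrative example (not part of the original source): for an engine named
// "TRTEngineOp_0" and a TF node "model/conv1", a shuffle layer would be named
// "TRTEngineOp_0/model/conv1:SHUFFLE" by SetLayerNameHelper, and
// GetLayerNameSuffix("reshape", 2) would return "reshape_2".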

}  // namespace

bool IsEngineInput(absl::string_view name) {
  return absl::StartsWith(name, IONamePrefixes::kInputPHName);
}
bool IsEngineOutput(absl::string_view name) {
  return absl::StartsWith(name, IONamePrefixes::kOutputPHName);
}

class TFAttrs {
 public:
  explicit TFAttrs(const NodeDef& tf_node) {
    for (const auto& attr : tf_node.attr()) {
      attrs_.insert({attr.first, &attr.second});
    }
  }

  bool count(const string& key) const { return attrs_.count(key); }

  AttrValue const* at(const string& key) const {
    if (!attrs_.count(key)) {
      LOG(FATAL) << "Attribute not found: " << key;
    }
    return attrs_.at(key);
  }

  template <typename T>
  T get(const string& key) const;

  template <typename T>
  T get(const string& key, const T& default_value) const {
    return attrs_.count(key) ? this->get<T>(key) : default_value;
  }

 private:
  std::map<string, AttrValue const*> attrs_;
};

template <>
string TFAttrs::get<string>(const string& key) const {
  return this->at(key)->s();
}

template <>
std::vector<int64> TFAttrs::get<std::vector<int64>>(const string& key) const {
  auto attr = this->at(key)->list().i();
  return std::vector<int64>(attr.begin(), attr.end());
}

template <>
std::vector<float> TFAttrs::get<std::vector<float>>(const string& key) const {
  auto attr = this->at(key)->list().f();
  return std::vector<float>(attr.begin(), attr.end());
}

template <>
nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(const string& key) const {
  nvinfer1::DataType trt_dtype(nvinfer1::DataType::kFLOAT);
  TF_CHECK_OK(TfTypeToTrtType(this->at(key)->type(), &trt_dtype));
  return trt_dtype;
}

template <>
DataType TFAttrs::get<DataType>(const string& key) const {
  return this->at(key)->type();
}

template <>
float TFAttrs::get<float>(const string& key) const {
  return this->at(key)->f();
}

template <>
bool TFAttrs::get<bool>(const string& key) const {
  return this->at(key)->b();
}

template <>
int64 TFAttrs::get<int64>(const string& key) const {
  return this->at(key)->i();
}

// TODO(laigd): use this utility function in more places.
Status RemoveBatchDimension(nvinfer1::Dims* dims) {
  if (dims->nbDims < 2) {
    return errors::InvalidArgument(
        "Dropping batch dimension requires dims with rank>=2.");
  }
  std::copy(dims->d + 1, dims->d + dims->nbDims, dims->d);
  dims->nbDims--;
  return Status::OK();
}

void GetOutputProperties(const grappler::GraphProperties& graph_properties,
                         const Node* node, const int out_port,
                         PartialTensorShape* shape, DataType* dtype) {
  if (graph_properties.HasOutputProperties(node->name())) {
    auto output_params = graph_properties.GetOutputProperties(node->name());
    auto out_shape = output_params.at(out_port);
    *dtype = out_shape.dtype();
    *shape = out_shape.shape();
  } else {
    LOG(INFO) << "Unknown output shape at node: " << node->name();
    *dtype = node->output_type(out_port);
  }
}

void GetInputProperties(const grappler::GraphProperties& graph_properties,
                        const Node* node, const int in_port,
                        PartialTensorShape* shape, DataType* dtype) {
  if (graph_properties.HasInputProperties(node->name())) {
    auto input_params = graph_properties.GetInputProperties(node->name());
    auto in_shape = input_params.at(in_port);
    *dtype = in_shape.dtype();
    *shape = in_shape.shape();
  } else {
    *dtype = node->input_type(in_port);
  }
}

// This function checks if a tensor is compatible with TRT.
//
// We check that the shape and datatype are compatible with TensorRT. We also
// return the corresponding trt_dtype, the trt_dims and the batch_size (the
// latter is only needed in implicit batch mode).
//
// The return status indicates whether the tensor is compatible.
//
// For implicit batch mode, when validation_only == false, we also check that
// all input dimensions (besides the batch dimension) are known dimensions.
Status ValidateTensorProperties(const string& producer_node_type,
                                const DataType dtype,
                                const PartialTensorShape& shape,
                                const bool use_implicit_batch,
                                bool validation_only,
                                nvinfer1::DataType* trt_dtype,
                                nvinfer1::Dims* trt_dims, int* batch_size) {
  // Convert data type.
  TF_RETURN_IF_ERROR(TfTypeToTrtType(dtype, trt_dtype));

  // Convert shape.
  if (shape.dims() < 0) {
    return errors::InvalidArgument("Input tensor rank is unknown.");
  }
  // Add 1 to maximum rank for implicit batch dim.
  const int max_rank = nvinfer1::Dims::MAX_DIMS + (use_implicit_batch ? 1 : 0);
  if (shape.dims() > max_rank) {
    return errors::OutOfRange("Input tensor rank is greater than ", max_rank);
  }
  if (use_implicit_batch && (producer_node_type != "Const") &&
      (shape.dims() < 1)) {
    return errors::InvalidArgument(
        "Scalar input tensor is not supported since the first dimension "
        "is treated as batch dimension by TRT");
  }
  TF_RETURN_IF_ERROR(
      TensorShapeToTrtDims(shape,
                           /*ignore_first_dim=*/use_implicit_batch, trt_dims));
  // Get the batch size for the tensor if it will not be included in the shape.
  if (use_implicit_batch) {
    *batch_size = shape.dim_size(0);
  }

  // Don't convert empty tensors (dim value of 0).
  const int first_trt_dim = use_implicit_batch ? 1 : 0;
  for (int d = first_trt_dim; d < shape.dims(); ++d) {
    if (shape.dim_size(d) == 0) {
      return errors::Unimplemented(
          "Input tensor with shape ", shape.DebugString(),
          " is an empty tensor, which is not supported by TRT");
    }
  }

  if (validation_only) return Status::OK();

  // The following checks are only performed during TRT engine creation time.
  if (use_implicit_batch) {
    for (int d = first_trt_dim; d < shape.dims(); ++d) {
      if (shape.dim_size(d) < 0) {
        return errors::InvalidArgument(
            "Input tensor with shape ", shape.DebugString(),
            " has an unknown non-batch dimension at dim ", d);
      }
    }
  }
  return Status::OK();
}

Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l,
                            const TRT_TensorOrWeights& operand_r,
                            const bool check_feasibility,
                            const bool use_implicit_batch,
                            nvinfer1::Dims* operand_l_new_dims,
                            nvinfer1::Dims* operand_r_new_dims) {
  // The TensorRT Elementwise op supports broadcast, but requires both tensors
  // to be of identical rank.
  //
  // We consider the case of:
  //   1. operand_l to be a Tensor & operand_r to be a Const;
  //   2. operand_l to be a Tensor & operand_r to be a Tensor;
  // note: const op const (constant folding) should fall back to TensorFlow
  //
  // broadcast scheme:
  //       T:  1 3 5    (tensor would not have batch dimension)
  //       W:  1 1 3 1  (weight would have all explicit dimensions)
  // i. fill in explicit dimensions
  //    -> T: -1 1 3 5  (we put a -1 for batch dimension)
  //    -> W:  1 1 3 1
  // ii. compare broadcast feasibility
  //
  // We cannot support the following since TensorRT does not allow manipulation
  // on batch dimension, so we cannot generate output with proper shape
  //    T: 3 5 1
  //    W: 1 1 1 1 3 5 1
  // -> T: 1 1 1 -1 3 5 1
  // -> W: 1 1 1  1 3 5 1
  // ***************************************************************************
  if (!operand_l.is_tensor() && !operand_r.is_tensor()) {
    return errors::InvalidArgument(
        "Broadcasting requires at least one of the operands be tensors");
  }

  const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1;
  auto compute_output_dims = [use_implicit_batch](
                                 const TRT_TensorOrWeights& input,
                                 int broadcast_num_dims, int* output_dims_array,
                                 nvinfer1::Dims* output_dims) {
    const nvinfer1::Dims input_dims = input.GetTrtDims();
    std::fill(output_dims_array, output_dims_array + max_nb_dims, 1);
    std::copy(input_dims.d, input_dims.d + input_dims.nbDims,
              output_dims_array + broadcast_num_dims - input_dims.nbDims);
    if (use_implicit_batch && input.is_tensor()) {
      const int true_input_dims = input_dims.nbDims + 1;
      if (true_input_dims < broadcast_num_dims) {
        return errors::InvalidArgument(
            "Broadcasting beyond batch dimension is not supported ",
            "(tensor #dims ", true_input_dims, " vs broadcast #dims ",
            broadcast_num_dims, ")");
      }
      // Set the batch dimension to -1, since batch size is not supposed to
      // be broadcasted.
      output_dims_array[0] = -1;
    }
    // Copy to output dimensions
    if (use_implicit_batch) {
      // Strip batch dimension while copying
      output_dims->nbDims = broadcast_num_dims - 1;
      std::copy(output_dims_array + 1, output_dims_array + broadcast_num_dims,
                output_dims->d);
    } else {
      output_dims->nbDims = broadcast_num_dims;
      std::copy(output_dims_array, output_dims_array + broadcast_num_dims,
                output_dims->d);
    }

    return Status::OK();
  };

  // Compute the output dimensions.
  const int broadcast_num_dims =
      std::max(operand_l.GetTrtDims().nbDims +
                   (use_implicit_batch && operand_l.is_tensor()),
               operand_r.GetTrtDims().nbDims +
                   (use_implicit_batch && operand_r.is_tensor()));
  int output_l[max_nb_dims], output_r[max_nb_dims];
  TF_RETURN_IF_ERROR(compute_output_dims(operand_l, broadcast_num_dims,
                                         output_l, operand_l_new_dims));
  TF_RETURN_IF_ERROR(compute_output_dims(operand_r, broadcast_num_dims,
                                         output_r, operand_r_new_dims));

  // Compare broadcast feasibility
  if (check_feasibility) {
    for (int i = 0; i < broadcast_num_dims; ++i) {
      if (!use_implicit_batch && (output_l[i] == -1 || output_r[i] == -1)) {
        // If the condition is true then we are in explicit batch mode and (at
        // least) one of the input dimensions is unknown. In other words, we
        // are in dynamic shape mode. During conversion time we only see -1 for
        // the unknown shapes, therefore we cannot decide on the feasibility of
        // broadcast over the unknown dimensions, so we just continue to the
        // next dimension. In dynamic shape mode TRT can only check the
        // feasibility of the broadcast when the actual input dimensions are
        // specified by SetTrtEngineInputs and the inference job is launched by
        // TrtEnqueue.
        continue;
      }
      if ((output_l[i] != output_r[i]) && (output_l[i] != 1) &&
          (output_r[i] != 1)) {
        return errors::InvalidArgument("Infeasible broadcast scheme (",
                                       "batch_dim: ", output_l[0], ", ",
                                       DebugString(*operand_l_new_dims), " vs ",
                                       "batch_dim: ", output_r[0], ", ",
                                       DebugString(*operand_r_new_dims), ")");
      }
    }
  }
  return Status::OK();
}
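// Worked example for the scheme documented above (illustration only, not part
// of the original source): with implicit batch, a tensor operand with TRT dims
// [1, 3, 5] and a weight operand with dims [1, 1, 3, 1] broadcast to
// broadcast_num_dims = 4. GetTrtBroadcastShape then returns
// operand_l_new_dims = [1, 3, 5] and operand_r_new_dims = [1, 3, 1]; the
// implicit batch dimension is stripped from both results.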

// Prepares a dynamic shape tensor for broadcast by adding leading 1 dimensions.
Status DynamicBroadcast(ITensorProxyPtr operand, OpConverterParams* params,
                        ITensorProxyPtr* output, int broadcasted_nbDims) {
  int operand_nbDims = operand->getDimensions().nbDims;
  if (broadcasted_nbDims > operand_nbDims) {
    if (params->validation_only) return Status::OK();
    int n_extra_dims = broadcasted_nbDims - operand_nbDims;
    VLOG(2) << "Dynamic broadcast adding " << n_extra_dims << " leading 1s";
    TF_RETURN_IF_ERROR(params->converter->DynamicReshape(
        operand, {std::make_pair(0, operand_nbDims)}, params, output,
        {n_extra_dims}));
  } else {
    *output = operand;
  }
  return Status::OK();
}

Status BroadcastWeights(std::unique_ptr<TRT_TensorOrWeights>& p,
                        nvinfer1::Dims broadcasted_dims) {
  if (!p->is_weights()) return errors::Internal("Weight input expected");
  if (p->GetTrtDims().nbDims != broadcasted_dims.nbDims) {
    TRT_ShapedWeights weights(p->weights());
    TF_RETURN_IF_ERROR(weights.SetShape(broadcasted_dims));
    p = std::make_unique<TRT_TensorOrWeights>(weights);
  }
  return Status::OK();
}

Status ApplyBroadcast(std::unique_ptr<TRT_TensorOrWeights>& operand,
                      nvinfer1::Dims broadcasted_dims,
                      OpConverterParams* params) {
  if (operand->is_weights()) {
    TF_RETURN_IF_ERROR(BroadcastWeights(operand, broadcasted_dims));
  } else {
    ITensorProxyPtr tensor = nullptr;
    auto is_static_shuffle_compatible = [](nvinfer1::Dims dims) {
      return std::count(dims.d, dims.d + dims.nbDims, -1) <= 1;
    };
    if (is_static_shuffle_compatible(broadcasted_dims)) {
      TF_RETURN_IF_ERROR(PrepareTensorForShape(
          params->converter, *operand, broadcasted_dims,
          params->validation_only, &tensor, params->node_def));
    } else {
      TF_RETURN_IF_ERROR(DynamicBroadcast(operand->tensor(), params, &tensor,
                                          broadcasted_dims.nbDims));
    }
    operand = std::make_unique<TRT_TensorOrWeights>(tensor);
  }
  return Status::OK();
}

// Inserts leading 1 dimensions so that both operands have the same rank.
// Note: In implicit batch mode, weights' shape can include an explicit 1 batch
// dimension. The broadcasted shape might lose this leading batch dim, because
// the broadcasted shape does not include the implicit batch dim.
// TODO(tfeher): Other code blocks that use GetTrtBroadcastShape need to be
// fixed to use this routine to handle dynamic inputs. Eventually,
// GetTrtBroadcastShape should only be used by this routine.
Status BroadcastTensors(std::unique_ptr<TRT_TensorOrWeights>& operand_l,
                        std::unique_ptr<TRT_TensorOrWeights>& operand_r,
                        bool check_feasibility, OpConverterParams* params) {
  nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r;
  TF_RETURN_IF_ERROR(GetTrtBroadcastShape(
      *operand_l, *operand_r, check_feasibility, params->use_implicit_batch,
      &broadcasted_dims_l, &broadcasted_dims_r));

  if (params->validation_only) return Status::OK();

  TF_RETURN_IF_ERROR(ApplyBroadcast(operand_l, broadcasted_dims_l, params));
  TF_RETURN_IF_ERROR(ApplyBroadcast(operand_r, broadcasted_dims_r, params));

  return Status::OK();
}

ITensorProxyPtr Converter::CreateConstantLayer(const TRT_ShapedWeights& weights,
                                               const nvinfer1::Dims& dims) {
  nvinfer1::Weights trt_weights = weights.GetTrtWeights();
  nvinfer1::IConstantLayer* layer = network()->addConstant(dims, trt_weights);
  if (!layer) return nullptr;
  SetLayerName(layer, "_tftrt_constant_",
               std::to_string(next_constant_layer_id_));
  next_constant_layer_id_++;
  ITensorProxyPtr trt_tensor = layer->getOutput(0);
  return trt_tensor;
}

// Creates a scalar constant and fills with value.
template <typename T>
Status CreateScalarConstant(
    OpConverterParams* params, T value, ITensorProxyPtr* tensor,
    nvinfer1::DataType trt_type = nvinfer1::DataType::kINT32,
    const nvinfer1::Dims& dims = {1, {1}}) {
  TRT_ShapedWeights weights =
      params->weight_store->GetTempWeights(trt_type, dims);
  TF_RETURN_IF_ERROR(weights.SetValues(value));
  *tensor = params->converter->CreateConstantLayer(weights, dims);
  TFTRT_RETURN_ERROR_IF_NULLPTR(*tensor, params->node_def.name());
  return Status::OK();
}

// Creates a constant with the same rank as dims, where each dimension has
// size = 1.
Status CreateBroadcastableScalarConstant(OpConverterParams* params, float value,
                                         const nvinfer1::Dims& dims,
                                         ITensorProxyPtr* tensor,
                                         const char* dtype_attr_name = "T") {
  nvinfer1::DataType trt_type = nvinfer1::DataType::kFLOAT;  // Default to FP32.
  TFAttrs attrs(params->node_def);
  if (attrs.count(dtype_attr_name)) {
    DataType dtype = attrs.get<DataType>(dtype_attr_name);
    TF_RETURN_IF_ERROR(TfTypeToTrtType(dtype, &trt_type));
  }

  // In order to be broadcastable, the number of dims has to match.
  nvinfer1::Dims broadcastable_dims(dims);
  for (int i = 0; i < broadcastable_dims.nbDims; i++) {
    broadcastable_dims.d[i] = 1;
  }
  return CreateScalarConstant(params, value, tensor, trt_type,
                              broadcastable_dims);
}

// The function concatenates tensors on the first axis. This can be used to
// create a shape tensor from individual dimension sizes.
StatusOr<ITensorProxyPtr> ConcatenateTensors(
    OpConverterParams* params, const std::vector<ITensorProxyPtr> input_tensors,
    absl::optional<int> op_instance = absl::nullopt) {
  std::vector<nvinfer1::ITensor*> trt_input_tensors;
  for (const auto& t : input_tensors) {
    trt_input_tensors.push_back(t->trt_tensor());
  }
  nvinfer1::IConcatenationLayer* layer =
      params->converter->network()->addConcatenation(
          static_cast<nvinfer1::ITensor* const*>(trt_input_tensors.data()),
          input_tensors.size());
  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, params->node_def.op());
  params->converter->SetLayerName(layer, params->node_def.name(),
                                  "concat_shapes", op_instance);
  layer->setAxis(0);
  return ITensorProxyPtr(layer->getOutput(0));
}

// Convert an axis from TF format to TRT format while validating. TF format
// includes the batch dimension, while TRT does not if implicit batching is used
// (i.e. for tensors). TF can also use negative indices.
Status ConvertAxis(int tf_axis, int trt_nb_dims, absl::string_view node_name,
                   bool use_implicit_batch, int* trt_axis) {
  const int tf_nb_dims = trt_nb_dims + (use_implicit_batch ? 1 : 0);
  // Check bounds.
  if (tf_axis < -tf_nb_dims || tf_axis >= tf_nb_dims) {
    return errors::InvalidArgument(
        "Axis value of ", tf_axis, " is out of bounds, must be in range [",
        -tf_nb_dims, ", ", tf_nb_dims, "), at ", node_name);
  }
  // Make negative axis positive.
  if (tf_axis < 0) tf_axis += tf_nb_dims;
  // Don't allow axis to be the batch dimension.
  if (use_implicit_batch && tf_axis == 0) {
    return errors::Unimplemented(
        "TensorRT does not allow manipulation of the batch dimension, at ",
        node_name);
  }
  // Remove batch dimension if it is implicit.
  *trt_axis = use_implicit_batch ? tf_axis - 1 : tf_axis;
  return Status::OK();
}
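// Worked example (illustration only): for a TF tensor of rank 4 converted with
// implicit batch, the TRT tensor has trt_nb_dims = 3, so tf_nb_dims = 4. A TF
// axis of -1 is first normalized to 3 and then mapped to *trt_axis = 2, while a
// TF axis of 0 is rejected because it addresses the implicit batch dimension.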

inline bool DimsEqual(const nvinfer1::Dims& dim_l,
                      const nvinfer1::Dims& dim_r) {
  if (dim_l.nbDims != dim_r.nbDims) {
    return false;
  }
  for (int i = 0; i < dim_l.nbDims; i++) {
    if (dim_l.d[i] != dim_r.d[i]) {
      return false;
    }
  }
  return true;
}

bool AllLengthsEqual(const std::vector<std::vector<int>>& inputs) {
  if (inputs.size() == 0) return true;
  int length = inputs.at(0).size();
  for (int i = 1; i < inputs.size(); i++) {
    if (inputs.at(i).size() != length) return false;
  }
  return true;
}

inline nvinfer1::Dims GetTrtDimsForTensor(const Tensor& tensor) {
  nvinfer1::Dims dims;
  dims.nbDims = tensor.dims();
  for (int i = 0; i < dims.nbDims; i++) {
    dims.d[i] = tensor.dim_size(i);
  }
  return dims;
}

int64_t Prod(const nvinfer1::Dims& dims) {
  int64_t count = 1;
  for (int d = 0; d < dims.nbDims; ++d) {
    count *= dims.d[d];
  }
  return count;
}

// Returns the total number of elements in an ITensor dimension.
// Returns 1 if the number of dims is 0 (the total number is fully determined by
// the batch size).
// Returns -1 if any dimension is unknown.
int64_t TrtTensorDimsNumElements(const nvinfer1::Dims& dims) {
  if (!HasStaticShape(dims)) return -1;
  return Prod(dims);
}

bool DimsHaveSameSize(const nvinfer1::Dims& lhs, const nvinfer1::Dims& rhs) {
  return TrtTensorDimsNumElements(lhs) == TrtTensorDimsNumElements(rhs);
}

// Returns whether both dimensions are fully specified and have the same total
// number of elements.
bool AreDimsStaticWithSameSize(const nvinfer1::Dims& lhs,
                               const nvinfer1::Dims& rhs) {
  if (!HasStaticShape(lhs) || !HasStaticShape(rhs)) return false;
  return DimsHaveSameSize(lhs, rhs);
}

bool AreDimsStaticWithDifferentSize(const nvinfer1::Dims& lhs,
                                    const nvinfer1::Dims& rhs) {
  if (!HasStaticShape(lhs) || !HasStaticShape(rhs)) return false;
  return !DimsHaveSameSize(lhs, rhs);
}

static std::vector<std::pair<int, int>> CreateSamePadding(
    const nvinfer1::Dims& stride, const nvinfer1::Dims& kernel,
    const std::vector<int64_t>& input_dims) {
  std::vector<std::pair<int, int>> padding(input_dims.size());
  CHECK_EQ(stride.nbDims, input_dims.size());  // TODO(jie): N+C? NC+?

  for (size_t i = 0; i < input_dims.size(); ++i) {
    // Formula to calculate the padding
    int p = ((input_dims[i] - 1) / stride.d[i]) * stride.d[i] + kernel.d[i] -
            input_dims[i];
    p = (p > 0) ? p : 0;

    // Right precedence padding, like in TensorFlow
    int left = p / 2;
    int right = p - left;

    VLOG(2) << "PADDING_" << i << " pre: " << left << ", post: " << right
            << ", paras: " << input_dims[i] << ", " << stride.d[i]
            << ", kernel: " << kernel.d[i];
    padding[i] = {left, right};
  }
  return padding;
}
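// Worked example of the SAME-padding formula above (illustration only): for
// input_dims[i] = 7, stride = 2 and kernel = 3, p = ((7 - 1) / 2) * 2 + 3 - 7
// = 2, giving left = 1 and right = 1. With input_dims[i] = 8 the result is
// p = 1, left = 0, right = 1, matching TensorFlow's right-heavy SAME padding.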

string GetCommonNameScope(const string& op_name_a, const string& op_name_b) {
  size_t last_scope_separator = 0;
  const size_t min_size = std::min(op_name_a.size(), op_name_b.size());
  for (size_t i = 0; i < min_size; ++i) {
    if (op_name_a[i] != op_name_b[i]) break;
    if (op_name_a[i] == '/') last_scope_separator = i + 1;
  }
  return op_name_a.substr(0, last_scope_separator);
}
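// For example (illustration only), GetCommonNameScope("model/block1/conv",
// "model/block1/bias") returns "model/block1/", while two names with no shared
// scope return the empty string.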

// Verifies that shapes of the given inputs match after masking the specified
// dimension.
Status VerifyShapesMatch(absl::Span<const TRT_TensorOrWeights> inputs,
                         int masked_dim, absl::string_view node_name) {
  size_t num_inputs = inputs.size();
  if (num_inputs <= 1) return Status::OK();

  const nvinfer1::Dims dims_0 = inputs.at(0).GetTrtDims();
  for (size_t i = 1; i < num_inputs; ++i) {
    const nvinfer1::Dims dim_i = inputs.at(i).GetTrtDims();
    if (dim_i.nbDims != dims_0.nbDims) {
      return errors::InvalidArgument(
          "Received inputs with inconsistent rank, at ", node_name);
    }
    for (size_t j = 0; j < dims_0.nbDims; ++j) {
      // Dynamic dimensions will be verified at runtime.
      if (dim_i.d[j] == -1 || dims_0.d[j] == -1) continue;
      if (dim_i.d[j] != dims_0.d[j] && j != masked_dim) {
        return errors::InvalidArgument(
            "Received inputs with inconsistent shape, at ", node_name);
      }
    }
  }
  return Status::OK();
}

TRT_ShapedWeights::TRT_ShapedWeights(nvinfer1::DataType type) : type_(type) {
  shape_.nbDims = 0;
  shape_.d[0] = 0;
}

TRT_ShapedWeights::TRT_ShapedWeights(nvinfer1::DataType type,
                                     nvinfer1::Dims dims, Tensor tensor)
    : shape_(dims), type_(type), tensor_(tensor) {
  if (dims.nbDims == 0) {
    DCHECK(dims.d[0] == 0 || dims.d[0] == 1);
  }
}

TRT_ShapedWeights::TRT_ShapedWeights(const TRT_ShapedWeights& rhs)
    : shape_(rhs.shape_), type_(rhs.type_), tensor_(rhs.tensor_) {}

int64_t TRT_ShapedWeights::count(nvinfer1::Dims dims) {
  if (dims.nbDims == 0) {
    assert(dims.d[0] == 0 || dims.d[0] == 1);
    return dims.d[0];
  }
  return Prod(dims);
}

nvinfer1::Weights TRT_ShapedWeights::GetTrtWeights() const {
  return nvinfer1::Weights{type_, GetValues(), count()};
}

template <typename T>
Status TRT_ShapedWeights::SetValues(T value) {
  switch (type_) {
    case nvinfer1::DataType::kFLOAT: {
      float* ptr = tensor_.flat<float>().data();
      std::fill(ptr, ptr + count(), value);
      break;
    }
    case nvinfer1::DataType::kHALF: {
      Eigen::half* ptr = tensor_.flat<Eigen::half>().data();
      std::fill(ptr, ptr + count(), Eigen::half(value));
      break;
    }
    case nvinfer1::DataType::kINT32: {
      int32* ptr = tensor_.flat<int32>().data();
      std::fill(ptr, ptr + count(), value);
      break;
    }
    default:
      return errors::InvalidArgument("Unsupported data type ",
                                     tensorflow::tensorrt::DebugString(type_));
  }
  return Status::OK();
}

Status TRT_ShapedWeights::SetShape(nvinfer1::Dims dims) {
  if (this->count() != TRT_ShapedWeights::count(dims)) {
    VLOG(2) << "Changing shape from "
            << tensorflow::tensorrt::DebugString(shape_) << ", to "
            << tensorflow::tensorrt::DebugString(dims);
    return errors::Internal("SetShape would change number of elements");
  }
  shape_ = dims;
  return Status::OK();
}

size_t TRT_ShapedWeights::size_bytes() const {
  size_t data_type_size = -1;
  switch (type_) {
    case nvinfer1::DataType::kFLOAT:
    case nvinfer1::DataType::kINT32:
      data_type_size = 4;
      break;
    case nvinfer1::DataType::kHALF:
      data_type_size = 2;
      break;
    case nvinfer1::DataType::kINT8:
    case nvinfer1::DataType::kBOOL:
      data_type_size = 1;
      break;
  }
  return this->count() * data_type_size;
}

string TRT_ShapedWeights::DebugString() const {
  return StrCat(
      "TRT_ShapedWeights(shape=", tensorflow::tensorrt::DebugString(shape_),
      ", type=", tensorflow::tensorrt::DebugString(type_),
      ", values=", reinterpret_cast<uintptr_t>(GetValues()), ")");
}

TRT_TensorOrWeights::TRT_TensorOrWeights(ITensorProxyPtr tensor)
    : tensor_proxy_ptr_(tensor), initialized_(true), is_tensor_(true) {}

TRT_TensorOrWeights::TRT_TensorOrWeights(ITensorProxyPtr tensor, int batch_size)
    : tensor_proxy_ptr_(tensor),
      batch_size_(batch_size),
      initialized_(true),
      is_tensor_(true) {}

TRT_TensorOrWeights::TRT_TensorOrWeights(nvinfer1::ITensor* tensor,
                                         int batch_size)
    : tensor_proxy_ptr_(tensor),
      batch_size_(batch_size),
      initialized_(true),
      is_tensor_(true) {}

TRT_TensorOrWeights::TRT_TensorOrWeights(nvinfer1::DataType trt_dtype,
                                         const nvinfer1::Dims& trt_dims,
                                         int batch_size)
    : tensor_proxy_ptr_(new SimpleITensor(trt_dtype, trt_dims)),
      batch_size_(batch_size),
      initialized_(true),
      is_tensor_(true) {}

TRT_TensorOrWeights::TRT_TensorOrWeights(const TRT_ShapedWeights& weights)
    : weights_(weights), initialized_(true), is_tensor_(false) {}

TRT_TensorOrWeights::TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs)
    : tensor_proxy_ptr_(rhs.tensor_proxy_ptr_),
      batch_size_(rhs.batch_size_),
      weights_(rhs.weights_),
      initialized_(rhs.initialized_),
      is_tensor_(rhs.is_tensor_) {}

void TRT_TensorOrWeights::operator=(const TRT_TensorOrWeights& rhs) {
  tensor_proxy_ptr_ = rhs.tensor_proxy_ptr_;
  batch_size_ = rhs.batch_size_;
  weights_ = rhs.weights_;
  initialized_ = rhs.initialized_;
  is_tensor_ = rhs.is_tensor_;
}

ITensorProxyPtr TRT_TensorOrWeights::tensor() const {
  CHECK(is_tensor());
  return tensor_proxy_ptr_;
}

nvinfer1::Dims TRT_TensorOrWeights::GetTrtDims() const {
  if (is_tensor()) {
    return tensor()->getDimensions();
  } else {
    return weights().shape_;
  }
}

Status TRT_TensorOrWeights::GetTfType(DataType* tf_type) const {
  if (is_tensor()) {
    nvinfer1::DataType trt_type = tensor()->getType();
    return TrtTypeToTfType(trt_type, tf_type);
  }
  if (is_weights()) {
    *tf_type = weights().GetTensor().dtype();
    return Status::OK();
  }
  return errors::Internal("The object is probably not initialized");
}

string TRT_TensorOrWeights::DebugString() const {
  string output = "TRT_TensorOrWeights(type=";
  if (is_tensor()) {
    StrAppend(&output, "tensor=", tensorflow::tensorrt::DebugString(tensor()),
              ", batch_size=", batch_size_);
  } else {
    StrAppend(&output, "weights=", weights_.DebugString());
  }
  StrAppend(&output, ")");
  return output;
}

// Performs a 5-dimensional reorder of data on the CPU. This is done once at
// convert time and does not affect GPU inference perf.
// Example: reorder NDHWC (TensorFlow) -> NCDHW (TensorRT)
template <typename T>
void Reorder5(const nvinfer1::Dims& shape, const T* idata,
              const nvinfer1::Dims& istrides, T* odata,
              const nvinfer1::Dims& ostrides) {
  for (int k = 0; k < shape.d[0]; ++k) {
    for (int c = 0; c < shape.d[1]; ++c) {
      for (int d = 0; d < shape.d[2]; ++d) {
        for (int r = 0; r < shape.d[3]; ++r) {
          for (int s = 0; s < shape.d[4]; ++s) {
            odata[k * ostrides.d[0] + c * ostrides.d[1] + d * ostrides.d[2] +
                  r * ostrides.d[3] + s * ostrides.d[4]] =
                idata[k * istrides.d[0] + c * istrides.d[1] +
                      d * istrides.d[2] + r * istrides.d[3] +
                      s * istrides.d[4]];
          }
        }
      }
    }
  }
}

// TODO(jie): reorder4 & reorder2 should be merged?
// TODO(aaroey): fix the order of parameters.
template <typename T>
void Reorder4(const nvinfer1::Dims4& shape, const T* idata,
              const nvinfer1::Dims4& istrides, T* odata,
              const nvinfer1::Dims4& ostrides) {
  for (int n = 0; n < shape.d[0]; ++n) {
    for (int c = 0; c < shape.d[1]; ++c) {
      for (int h = 0; h < shape.d[2]; ++h) {
        for (int w = 0; w < shape.d[3]; ++w) {
          odata[n * ostrides.d[0] + c * ostrides.d[1] + h * ostrides.d[2] +
                w * ostrides.d[3]] =
              idata[n * istrides.d[0] + c * istrides.d[1] + h * istrides.d[2] +
                    w * istrides.d[3]];
        }
      }
    }
  }
}

template <typename T>
void Reorder2(const nvinfer1::DimsHW& shape, const T* idata,
              const nvinfer1::DimsHW& istrides, T* odata,
              const nvinfer1::DimsHW& ostrides) {
  for (int h = 0; h < shape.h(); ++h) {
    for (int w = 0; w < shape.w(); ++w) {
      odata[h * ostrides.h() + w * ostrides.w()] =
          idata[h * istrides.h() + w * istrides.w()];
    }
  }
}

// TODO(jie): fallback to tensorflow!!
void ReorderCKtoKC(const TRT_ShapedWeights& iweights,
                   TRT_ShapedWeights* oweights) {
  const int c = iweights.shape_.d[0];
  const int k = iweights.shape_.d[1];
  oweights->shape_.d[0] = k;
  oweights->shape_.d[1] = c;
  const nvinfer1::DimsHW istrides = {1, k};
  const nvinfer1::DimsHW ostrides = {c, 1};
  switch (iweights.TrtDType()) {
    case nvinfer1::DataType::kFLOAT: {
      Reorder2({k, c}, static_cast<float const*>(iweights.GetValues()),
               istrides, static_cast<float*>(oweights->GetValues()), ostrides);
      break;
    }
    case nvinfer1::DataType::kHALF: {
      Reorder2({k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
               istrides, static_cast<Eigen::half*>(oweights->GetValues()),
               ostrides);
      break;
    }
    default:
      LOG(FATAL) << "Unsupported type in reorder, expected fp32 or fp16 but "
                 << "got " << DebugString(iweights.TrtDType());
  }
}
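// Worked example for ReorderCKtoKC (illustration only): CK weights with shape
// [C=8, K=16] become KC weights with shape [16, 8]; the element at (c, k) in
// the input is copied to position (k, c) in the output.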

void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights,
                       TRT_ShapedWeights* oweights, const int num_groups) {
  CHECK(iweights.TrtDType() == oweights->TrtDType());
  CHECK_EQ(iweights.size_bytes(), oweights->size_bytes());
  // K indexes over output channels, C over input channels, and R and S over the
  // height and width of the convolution
  const int r = iweights.shape_.d[0];
  const int s = iweights.shape_.d[1];
  // TRT requires GKcRS, while TF depthwise has RSCK where c=1, C=G
  const int c = iweights.shape_.d[2] / num_groups;
  const int k = iweights.shape_.d[3] * num_groups;
  VLOG(2) << "num_groups: " << num_groups << ", c: " << iweights.shape_.d[2]
          << " then " << c << ", k: " << iweights.shape_.d[3] << " then " << k
          << ", r: " << iweights.shape_.d[0] << " then " << r
          << ", s: " << iweights.shape_.d[1] << " then " << s;
  oweights->shape_.d[0] = k / num_groups;
  oweights->shape_.d[1] = c * num_groups;
  oweights->shape_.d[2] = r;
  oweights->shape_.d[3] = s;
  const nvinfer1::Dims4 istrides = {1, k, s * k * c, c * k};
  const nvinfer1::Dims4 ostrides = {c * r * s, r * s, s, 1};
  switch (iweights.TrtDType()) {
    case nvinfer1::DataType::kFLOAT: {
      Reorder4({k, c, r, s}, static_cast<float const*>(iweights.GetValues()),
               istrides, static_cast<float*>(oweights->GetValues()), ostrides);
      break;
    }
    case nvinfer1::DataType::kHALF: {
      Reorder4({k, c, r, s},
               static_cast<Eigen::half const*>(iweights.GetValues()), istrides,
               static_cast<Eigen::half*>(oweights->GetValues()), ostrides);
      break;
    }

    default:
      LOG(FATAL) << "Unsupported type, expected fp32 or fp16 but got "
                 << DebugString(iweights.TrtDType());
  }
}
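// Worked example for ReorderRSCKToKCRS (illustration only): with
// num_groups = 1, RSCK weights of shape [R=3, S=3, C=8, K=16] are rewritten as
// KCRS weights of shape [16, 8, 3, 3], which is the layout TensorRT expects
// for a regular (non-depthwise) convolution kernel.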

// Initialize a Dims object with arbitrary dimension
nvinfer1::Dims InitDimsN(std::initializer_list<int> list) {
  nvinfer1::Dims dim;
  dim.nbDims = list.size();
  std::copy(list.begin(), list.end(), dim.d);
  return dim;
}

// Reorder 3D convolution weights from TF to TRT
void ReorderDRSCKToKCDRS(const TRT_ShapedWeights& iweights,
                         TRT_ShapedWeights* oweights, const int num_groups) {
  DCHECK(iweights.TrtDType() == oweights->TrtDType());
  CHECK_EQ(iweights.size_bytes(), oweights->size_bytes());
  // K indexes over output channels, C over input channels, and R, S, D over the
  // height, width, depth
  const int d = iweights.shape_.d[0];
  const int r = iweights.shape_.d[1];
  const int s = iweights.shape_.d[2];
  // TRT requires GKcRS, while TF depthwise has RSCK where c=1, C=G
  const int c = iweights.shape_.d[3] / num_groups;
  const int k = iweights.shape_.d[4] * num_groups;

  VLOG(2) << "num_groups: " << num_groups << ", c: " << iweights.shape_.d[3]
          << " becomes " << c << ", k: " << iweights.shape_.d[4] << " becomes "
          << k << ", d: " << d << ", r: " << r << ", s: " << s;

  oweights->shape_.d[0] = iweights.shape_.d[4];  // k / num_groups;
  oweights->shape_.d[1] = iweights.shape_.d[3];  // c * num_groups;
  oweights->shape_.d[2] = d;
  oweights->shape_.d[3] = r;
  oweights->shape_.d[4] = s;

  nvinfer1::Dims shape =
      InitDimsN({k, c, d, r, s});  // KCDRS shape (same as output)

  nvinfer1::Dims ostrides =
      InitDimsN({c * d * r * s, d * r * s, r * s, s,
                 1});  // Output = KCDRS = k*CDRS + c*DRS + d*RS + r*S + s

  nvinfer1::Dims istrides =
      InitDimsN({1, k, r * s * c * k, s * c * k,
                 c * k});  // Input = DRSCK = k*1 + c*K + d*RSCK + r*SCK + s*CK

  switch (iweights.TrtDType()) {
    case nvinfer1::DataType::kFLOAT: {
      Reorder5(shape, static_cast<float const*>(iweights.GetValues()), istrides,
               static_cast<float*>(oweights->GetValues()), ostrides);
      break;
    }
    case nvinfer1::DataType::kHALF: {
      Reorder5(shape, static_cast<Eigen::half const*>(iweights.GetValues()),
               istrides, static_cast<Eigen::half*>(oweights->GetValues()),
               ostrides);
      break;
    }
    default:
      LOG(FATAL) << "Unsupported type, expected fp32 or fp16 but got "
                 << DebugString(iweights.TrtDType());
  }
}

TRT_ShapedWeights TrtWeightStore::GetTempWeights(nvinfer1::DataType trt_dtype,
                                                 const nvinfer1::Dims& dims) {
  TensorShape shape;
  DataType tf_dtype;
  // TODO(laigd): make it return a status.
  TF_CHECK_OK(TensorShapeUtils::MakeShape(dims.d, dims.nbDims, &shape));
  TF_CHECK_OK(TrtTypeToTfType(trt_dtype, &tf_dtype));
  // TODO(jie): check weights size_bytes. 0 means type error
  Tensor tensor(tf_dtype, shape);
  TRT_ShapedWeights weights(trt_dtype, dims, tensor);
  store_.emplace_back(std::move(tensor));
  return weights;
}

OpConverterParams::OpConverterParams(
    const NodeDef& node_def, const std::vector<TRT_TensorOrWeights>& inputs,
    std::vector<TRT_TensorOrWeights>* outputs, TrtWeightStore* weight_store,
    TrtPrecisionMode precision_mode, bool use_calibration,
    bool use_implicit_batch)
    : node_def(node_def),
      inputs(inputs),
      outputs(outputs),
      validation_only(true),
      weight_store(weight_store),
      precision_mode(precision_mode),
      use_calibration(use_calibration),
      use_implicit_batch(use_implicit_batch) {}

OpConverterParams::OpConverterParams(
    Converter* converter, const NodeDef& node_def,
    const std::vector<TRT_TensorOrWeights>& inputs,
    std::vector<TRT_TensorOrWeights>* outputs, TrtWeightStore* weight_store)
    : converter(converter),
      node_def(node_def),
      inputs(inputs),
      outputs(outputs),
      validation_only(false),
      weight_store(weight_store),
      precision_mode(converter->precision_mode()),
      use_calibration(converter->use_calibration()),
      use_implicit_batch(converter->use_implicit_batch()) {}

const std::set<string>* TrtNodeValidator::quantize_ops = new std::set<string>{
    "QuantizeAndDequantizeV2",
    "QuantizeAndDequantizeV3",
    "FakeQuantWithMinMaxVars",
    "FakeQuantWithMinMaxArgs",
};

bool IsQuantizeAndDequantizeOp(const Node* node) {
  return TrtNodeValidator::quantize_ops->count(node->def().op()) != 0;
}

TrtNodeValidator::TrtNodeValidator(
    const grappler::GraphProperties& graph_properties,
    TrtPrecisionMode precision_mode, bool use_calibration,
    bool use_implicit_batch)
    : graph_properties_(graph_properties),
      precision_mode_(precision_mode),
      use_calibration_(use_calibration),
      use_implicit_batch_(use_implicit_batch) {
  RegisterOpValidators();
}

Status TrtNodeValidator::ConvertToTensorOrWeights(
    const NodeDef& node_def, int output_port,
    TRT_TensorOrWeights* tensor_or_weights) {
  if (node_def.op() == "Const") {
    if (output_port != 0) {
      return errors::InvalidArgument("Const node should only have one output.");
    }
    // The output of the conversion will be used as input to other nodes to
    // determine whether TRT supports those nodes. If it cannot convert the
    // Const, it's very likely we cannot treat it as a tensor and make it an
    // input to the TRT network, since TRT removes the first dimension and
    // treats it as batch size. Also, it's not likely that the converter can
    // support the op, and performance may suffer even if it can, so we just
    // simply return error if the conversion fails.
    std::vector<TRT_TensorOrWeights> inputs;
    return ConvertConstToWeights(node_def, inputs, tensor_or_weights);
  }
  if (!graph_properties_.HasOutputProperties(node_def.name())) {
    return errors::InvalidArgument("Shape and data type are unknown");
  }

  // Validate and convert shape and dtype.
  const auto& output_params =
      graph_properties_.GetOutputProperties(node_def.name());
  const auto& tensor_properties = output_params.at(output_port);
  const DataType dtype = tensor_properties.dtype();
  const PartialTensorShape shape = tensor_properties.shape();
  nvinfer1::DataType trt_dtype;
  nvinfer1::Dims trt_dims;
  int batch_size = -1;
  TF_RETURN_IF_ERROR(ValidateTensorProperties(
      node_def.op(), dtype, shape, use_implicit_batch_,
      /*validation_only=*/true, &trt_dtype, &trt_dims, &batch_size));

  // Adds a fake ITensor. This is fine since op converter operates in
  // validation-only mode and it won't (and shouldn't) use the tensor to do
  // any TRT network operations.
  *tensor_or_weights = TRT_TensorOrWeights(trt_dtype, trt_dims, batch_size);
  return Status::OK();
}

Status TrtNodeValidator::IsTensorRTCandidate(const Node* node) {
  const string& op = node->def().op();
  // In INT8 mode, we will always apply the quantization ranges provided by
  // these ops to the relevant tensors. This happens regardless of the value of
  // use_calibration.
  bool is_supported_op = false;
  if (quantize_ops->count(op)) {
    is_supported_op = (precision_mode_ == TrtPrecisionMode::INT8);
  } else {
    is_supported_op = op_validators_.count(op);
  }
  if (!is_supported_op) {
    return errors::Unimplemented("Op type ", op, " is not supported.");
  }

  // Convert input NodeDef and corresponding output ports to
  // TRT_TensorOrWeights.
  std::vector<TRT_TensorOrWeights> inputs;
  std::vector<const Edge*> input_edges;
  TF_RETURN_IF_ERROR(node->input_edges(&input_edges));
  for (const Edge* edge : input_edges) {
    TRT_TensorOrWeights tensor_or_weights;
    const NodeDef& src_def = edge->src()->def();
    Status status = ConvertToTensorOrWeights(src_def, edge->src_output(),
                                             &tensor_or_weights);
    if (!status.ok()) {
      return errors::Internal(
          "Failed to convert input ", src_def.name(),
          " to a TRT_TensorOrWeights: ", status.error_message());
    }
    inputs.push_back(tensor_or_weights);
  }

  OpConverter validator = op_validators_[op];
  OpConverterParams params(node->def(), inputs, /*arg_outputs=*/nullptr,
                           &weight_store_, precision_mode_, use_calibration_,
                           use_implicit_batch_);
  return validator(&params);
}

Status TrtNodeValidator::ConvertConstToWeights(
    const NodeDef& const_node_def,
    const std::vector<TRT_TensorOrWeights>& inputs,
    TRT_TensorOrWeights* output) {
  std::vector<TRT_TensorOrWeights> outputs;
  OpConverterParams params(const_node_def, inputs, &outputs, &weight_store_,
                           precision_mode_, use_calibration_,
                           use_implicit_batch_);
  Status status = op_validators_["Const"](&params);
  if (status.ok() && output) *output = outputs[0];
  return status;
}

// static
StatusOr<std::unique_ptr<Converter>> Converter::Create(
    TrtPrecisionMode precision_mode, bool use_calibration,
    nvinfer1::ILogger* trt_logger, const bool use_implicit_batch,
    absl::string_view engine_name) {
  std::unique_ptr<Converter> converter = absl::WrapUnique(
      new Converter(precision_mode, use_calibration, trt_logger,
                    use_implicit_batch, engine_name));
  TF_RETURN_IF_ERROR(converter->Init(trt_logger));
  return converter;
}

Converter::Converter(TrtPrecisionMode precision_mode, bool use_calibration,
                     nvinfer1::ILogger* trt_logger,
                     const bool use_implicit_batch,
                     absl::string_view engine_name)
    : precision_mode_(precision_mode),
      use_calibration_(use_calibration),
      use_implicit_batch_(use_implicit_batch),
      engine_name_(engine_name) {
  MaybeInitializeTrtPlugins(trt_logger);
  this->RegisterOpConverters();
}

Status Converter::Init(nvinfer1::ILogger* trt_logger) {
  VLOG(1) << "Creating TensorRT builder";
  trt_builder_.reset(nvinfer1::createInferBuilder(*trt_logger));

  VLOG(1) << "Creating TensorRT network";
  const uint32_t flags =
      use_implicit_batch_
          ? 0U
          : (1U << static_cast<int>(
                 nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
  trt_network_.reset(trt_builder_->createNetworkV2(flags));
  if (!trt_network_) {
    return errors::Internal("Failed to create TensorRT network object");
  }
  return Status::OK();
}

Status Converter::ConvertNode(const NodeDef& node_def) {
  std::vector<TRT_TensorOrWeights> inputs, outputs;
  TF_RETURN_IF_ERROR(this->GetInputs(node_def, &inputs));

  OpConverterParams params(this, node_def, inputs, &outputs, &weight_store_);
  const string& op = node_def.op();
  auto itr = op_registry_.find(op);
  if (itr == op_registry_.end()) {
    return errors::Unimplemented("No converter registered for op: ", op);
  }
  OpConverter op_converter = itr->second;
  TF_RETURN_IF_ERROR(op_converter(&params));

  for (size_t i = 0; i < outputs.size(); ++i) {
    TRT_TensorOrWeights& output = outputs[i];
    string output_name = node_def.name();
    if (i != 0) absl::StrAppend(&output_name, ":", i);
    // We need to check the name before setting it. If the input is one of the
    // engine input, setting the name here will overwrite engine input
    // bindings which will cause runtime error.
    // TODO(tmorris): Remove this work-around once we use TRT's IIdentityLayer
    // in ConvertIdentity.
    if (output.is_tensor()) {
      const char* tensor_name = output.tensor()->getName();
      if (!IsEngineInput(tensor_name)) {
        // TRT initializes tensor names as "(Unnamed ITensor* N)". We rename
        // them to match their corresponding TensorFlow name.
        // Note: ITensors that we create internally within TF-TRT which are
        // not inputs or outputs of a node will not be renamed. This is a
        // potential cause of confusion if an error message or warning
        // mentions the unnamed tensor.
        output.tensor()->setName(output_name.c_str());
      }
    }
    VLOG(2) << "Adding out tensor " << output_name << ": "
            << output.DebugString();
    Status status = AddTensorOrWeights(output_name, output);
    if (!status.ok()) {
      return Status(status.code(),
                    StrCat("Failed to add output for node ", node_def.name(),
                           ": ", status.error_message()));
    }
  }
  return Status::OK();
}

AddInputTensor(const string & name,nvinfer1::DataType dtype,const nvinfer1::Dims & dims,int batch_size)1355 Status Converter::AddInputTensor(const string& name, nvinfer1::DataType dtype,
1356 const nvinfer1::Dims& dims, int batch_size) {
1357 // We verify the batch size only for the input nodes, and rely on the
1358 // individual op converters to ensure the batch size of the outputs is not changed.
1359 // TODO(laigd): we need to test these properties.
1360 Status status;
1361 if (use_implicit_batch_) {
1362 status = MaybeUpdateBatchSize(batch_size);
1363 if (!status.ok()) {
1364 return Status(status.code(),
1365 StrCat("Batch size doesn't match for tensor ", name, ": ",
1366 status.error_message()));
1367 }
1368 }
1369 ITensorProxyPtr tensor = network()->addInput(name.c_str(), dtype, dims);
1370 if (*tensor == nullptr) {
1371 return errors::InvalidArgument("Failed to create Input layer tensor ", name,
1372 " rank=", dims.nbDims);
1373 }
1374 status = AddTensorOrWeights(name, TRT_TensorOrWeights(tensor));
1375 if (!status.ok()) {
1376 return Status(status.code(), StrCat("Failed to add input tensor ", name,
1377 ": ", status.error_message()));
1378 }
1379 return Status::OK();
1380 }
1381
1382 Status Converter::RenameAndMarkOutputTensors(
1383 const std::vector<Converter::EngineOutputInfo>& output_tensors) {
1384 int output_index = 0;
1385 for (const auto& output : output_tensors) {
1386 TRT_TensorOrWeights tensor_or_weights;
1387 TF_RETURN_IF_ERROR(
1388 GetTensorOrWeights(output.source_tensor_name, &tensor_or_weights));
1389 if (!tensor_or_weights.is_tensor()) {
1390 return errors::InvalidArgument("Output ", output.source_tensor_name,
1391 " is weights not tensor");
1392 }
1393 ITensorProxyPtr tensor = tensor_or_weights.tensor();
1394 if (*tensor == nullptr) {
1395 return errors::NotFound("Output tensor not found: ",
1396 output.source_tensor_name);
1397 }
1398 // Check if this tensor has already been marked as an input or output.
1399 //
1400 // ConvertIdentity can cause the same tensor to be repeated in
1401 // output_tensors, which can cause us to overwrite the name of the output
1402 // tensor binding. For example, if we rename OutputPH_0 to OutputPH_1 then
1403 // we won't be able to locate OutputPH_0 during runtime. To fix this,
1404 // duplicate the tensor using no-op shuffle.
1405 //
1406 // TODO(tmorris): Remove this work-around once we use TRT's IIdentityLayer
1407 // in ConvertIdentity.
1408 if (IsEngineInput(tensor->getName()) || IsEngineOutput(tensor->getName())) {
1409 // Using shuffle layer for identity by not setting reshape or transpose.
1410 nvinfer1::IShuffleLayer* layer =
1411 network()->addShuffle(*tensor->trt_tensor());
1412 TFTRT_RETURN_ERROR_IF_NULLPTR(
1413 layer, StrCat("Output Copy for ", tensor->getName()));
1414 SetLayerName(layer, tensor->getName(), "shuffle", output_index);
1415 tensor = layer->getOutput(0);
1416 }
1417 tensor->setName(output.dest_node_name.c_str());
1418 network()->markOutput(*tensor->trt_tensor());
1419 // Set type after marking as output. TRT only supports setType for engine
1420 // outputs and inputs (type is inferred otherwise).
1421 tensor->setType(output.trt_dtype);
1422 output_index++;
1423 VLOG(1) << "Marking output TRT tensor " << output.source_tensor_name
1424 << " with data type " << DebugString(output.trt_dtype)
1425 << ", which feeds TF node " << output.dest_node_name;
1426 }
1427 if (VLOG_IS_ON(2)) {
1428 VLOG(2) << "Created TensorRT network with the following layers:";
1429 for (int i = 0; i < network()->getNbLayers(); i++) {
1430 auto layer = network()->getLayer(i);
1431 VLOG(2) << " " << layer->getName() << " ("
1432 << "type: " << static_cast<int>(layer->getType())
1433 << ", precision: " << static_cast<int>(layer->getPrecision())
1434 << ")";
1435 }
1436 }
1437 return Status::OK();
1438 }
1439
1440 #if IS_TRT_VERSION_GE(7, 1, 3, 0)
1441 // An algorithm selector that always returns a specific ID for selectAlgorithms.
1442 // This is used to support the implementation of using environment variable
1443 // `TF_TRT_FIXED_ALGORITHM_ID` for debugging TensorRT.
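// Editorial usage sketch (the script name is hypothetical): assuming the
// variable is read as in BuildCudaEngine below, a debugging run could force a
// single tactic with
//   TF_TRT_FIXED_ALGORITHM_ID=2 python run_converted_model.py
// where the requested ID is clamped to the number of candidates TRT offers.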
1444 class StaticAlgorithmSelector : public nvinfer1::IAlgorithmSelector {
1445 private:
1446 int32_t algorithm_id_;
1447
1448 public:
1449 StaticAlgorithmSelector(int32_t algorithm_id) : algorithm_id_(algorithm_id) {}
1450
1451 // Returns value in [0, nbChoices] for a valid algorithm.
1452 int32_t selectAlgorithms(const nvinfer1::IAlgorithmContext& algoContext,
1453 const nvinfer1::IAlgorithm* const* algoChoices,
1454 int32_t nbChoices,
1455 int32_t* selection) noexcept override {
1456 // TensorRT always provides at least one algorithm candidate in
1457 // selectAlgorithms.
1458 assert(nbChoices > 0);
1459
1460 // Make sure the requested TRT algorithm ID does not exceed the maximum
1461 // accepted value.
1462 selection[0] = std::min(algorithm_id_, nbChoices);
1463 return 1;
1464 }
1465
1466 // Called by TensorRT to report choices it made.
1467 void reportAlgorithms(const nvinfer1::IAlgorithmContext* const* algoContexts,
1468 const nvinfer1::IAlgorithm* const* algoChoices,
1469 int32_t nbAlgorithms) noexcept override {
1470 } // do nothing
1471 };
1472 #endif
1473
1474 Status Converter::BuildCudaEngine(
1475 TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, int max_batch_size,
1476 size_t max_workspace_size_bytes, nvinfer1::IGpuAllocator* allocator,
1477 TRTInt8Calibrator* calibrator, TrtShapeOptimizationProfile* profiles) {
1478 tensorflow::profiler::AnnotatedTraceMe activity(
1479 [&]() {
1480 return tensorflow::profiler::TraceMeOpOverride("TRTEngineOp",
1481 "BuildEngine");
1482 },
1483 tensorflow::profiler::TraceMeLevel::kInfo);
1484
1485 VLOG(1) << "Configuring TensorRT builder";
1486 trt_builder_->setMaxBatchSize(max_batch_size);
1487 trt_builder_->setGpuAllocator(allocator);
1488
1489 // Create a network configuration and use it to build a TRT engine.
1490 TrtUniquePtrType<nvinfer1::IBuilderConfig> builder_config(
1491 trt_builder_->createBuilderConfig());
1492 builder_config->setMaxWorkspaceSize(max_workspace_size_bytes);
1493
1494 #if IS_TRT_VERSION_GE(7, 1, 3, 0)
1495 static int32_t trt_algorithm_id = [] {
1496 int64 trt_algorithm_id;
1497 TF_CHECK_OK(tensorflow::ReadInt64FromEnvVar("TF_TRT_FIXED_ALGORITHM_ID",
1498 /*default_val=*/-1,
1499 &trt_algorithm_id));
1500 return static_cast<int32_t>(trt_algorithm_id);
1501 }();
1502
1503 if (trt_algorithm_id >= 0) {
1504 VLOG(1) << "Forcing TRT algorithm selection to: ID=" << trt_algorithm_id;
1505 StaticAlgorithmSelector trt_algorithm_selector(trt_algorithm_id);
1506 builder_config->setAlgorithmSelector(&trt_algorithm_selector);
1507 }
1508 #endif
1509
1510 #if IS_TRT_VERSION_GE(8, 0, 0, 0)
1511 builder_config->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS);
1512 VLOG(1) << "Setting sparsity for TensorRT8!";
1513 #endif
1514
1515 if (precision_mode_ == TrtPrecisionMode::FP16) {
1516 builder_config->setFlag(nvinfer1::BuilderFlag::kFP16);
1517 } else if (precision_mode_ == TrtPrecisionMode::INT8) {
1518 builder_config->setFlag(nvinfer1::BuilderFlag::kFP16);
1519 builder_config->setFlag(nvinfer1::BuilderFlag::kINT8);
1520 if (use_calibration_) {
1521 builder_config->setInt8Calibrator(calibrator);
1522 } else {
1523 builder_config->setInt8Calibrator(nullptr);
1524 }
1525 }
1526 if (!use_implicit_batch_ && profiles) {
1527 TF_RETURN_IF_ERROR(profiles->ConfigureBuilder(
1528 trt_builder_.get(), builder_config.get(), network()));
1529 }
1530
1531 string precision_mode_str;
1532 TF_RETURN_IF_ERROR(
1533 TrtPrecisionModeToName(precision_mode_, &precision_mode_str));
1534 string trt_network_name = StrCat(
1535 "TF:", TF_VERSION_STRING, ", ",
1536 "TRT:", absl::StrJoin(GetLoadedTensorRTVersion(), "."), "-",
1537 "Precision:", precision_mode_str, ", ", "Calibration:", use_calibration_,
1538 ", ", "Max-Batch-Size:", max_batch_size, ", ",
1539 "Max-Workspace-Size:", max_workspace_size_bytes);
1540 VLOG(1) << "Setting TensorRT network name to " << trt_network_name;
1541 network()->setName(trt_network_name.c_str());
1542
1543 VLOG(1) << "Building TensorRT engine";
1544 if (VLOG_IS_ON(2)) {
1545 VLOG(2) << "Network inputs";
1546 int n_inputs = network()->getNbInputs();
1547 for (int i = 0; i < n_inputs; i++) {
1548 const ITensorProxyPtr input = network()->getInput(i);
1549 if (*input) {
1550 VLOG(2) << " " << i << " " << input->getName();
1551 } else {
1552 VLOG(2) << "Could not find input " << i;
1553 }
1554 }
1555 }
1556 engine->reset(
1557 trt_builder_->buildEngineWithConfig(*network(), *builder_config));
1558 if (engine->get() == nullptr) {
1559 return errors::Internal("Failed to build TensorRT engine");
1560 }
1561 if (VLOG_IS_ON(2)) {
1562 VLOG(2) << "TRT engine created";
1563 int nbBindings = (*engine)->getNbBindings();
1564 VLOG(2) << "Number of engine bindings: " << nbBindings;
1565 for (int i = 0; i < nbBindings; i++) {
1566 VLOG(2) << "Binding " << i << " name: " << (*engine)->getBindingName(i);
1567 }
1568 }
1569 return Status::OK();
1570 }
1571
1572 Status Converter::MaybeUpdateBatchSize(int batch_size) {
1573 // OK iff either is unknown or they are equal to each other.
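// Worked example of the rule above (editorial addition):
//   batch_size_ = -1, batch_size =  8  -> batch_size_ becomes 8;
//   batch_size_ =  8, batch_size =  8  -> OK, unchanged;
//   batch_size_ =  8, batch_size =  4  -> InvalidArgument below.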
1574 if (this->batch_size_ < 0 || batch_size < 0 ||
1575 this->batch_size_ == batch_size) {
1576 if (this->batch_size_ < 0 && batch_size >= 0) {
1577 this->batch_size_ = batch_size;
1578 }
1579 return Status::OK();
1580 }
1581 return errors::InvalidArgument(
1582 "Provided batch size does not match converter batch size: ", batch_size,
1583 " vs ", batch_size_);
1584 }
1585
1586 Status Converter::AddTensorOrWeights(const string& name,
1587 TRT_TensorOrWeights input) {
1588 // Set the batch size of the tensor, using batch size collected from the
1589 // input tensors to the TRT subgraph at the beginning of the conversion.
1590 // We rely on the individual op converter to understand the semantics of the
1591 // TF node, and make sure it doesn't change the batch size nor introduce
1592 // intra-element dependency inside the batch.
1593 if (use_implicit_batch_ && input.is_tensor()) {
1594 input.set_batch_size(batch_size_);
1595 }
1596 if (trt_tensors_.insert({name, std::move(input)}).second) return Status::OK();
1597 return errors::AlreadyExists("tensor/weights ", name, " already exist.");
1598 }
1599
1600 Status Converter::GetTensorOrWeights(const string& name,
1601 TRT_TensorOrWeights* output) {
1602 if (!trt_tensors_.count(name)) {
1603 return errors::NotFound("Tensor or weights with name ", name,
1604 " could not be found.");
1605 }
1606 *output = trt_tensors_.at(name);
1607 return Status::OK();
1608 }
1609
1610 Status Converter::TransposeTensor(ITensorProxyPtr input_tensor,
1611 const std::vector<int>& order_with_batch_dim,
1612 ITensorProxyPtr* output_tensor,
1613 const NodeDef& node_def,
1614 absl::string_view sub_op_name) {
1615 const auto dims = input_tensor->getDimensions();
1616 const int order_size = use_implicit_batch_ ? order_with_batch_dim.size() - 1
1617 : order_with_batch_dim.size();
1618 if (order_size != size_t(dims.nbDims)) {
1619 return errors::InvalidArgument(
1620 "Rank of perm for transpose does not match with that of the input.");
1621 }
1622 if (use_implicit_batch_ && order_with_batch_dim[0] != 0) {
1623 return errors::Unimplemented(
1624 "Transpose at batch dimension is not supported.");
1625 }
1626
1627 nvinfer1::IShuffleLayer* layer =
1628 this->network()->addShuffle(*input_tensor->trt_tensor());
1629 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Transpose");
1630 SetLayerName(layer, node_def, sub_op_name);
1631
1632 nvinfer1::Permutation permutation;
1633 if (use_implicit_batch_) {
1634 for (int32_t i = 0; i < dims.nbDims; ++i) {
1635 permutation.order[i] = order_with_batch_dim[i + 1] - 1;
1636 }
1637 } else {
1638 std::copy(order_with_batch_dim.begin(), order_with_batch_dim.end(),
1639 permutation.order);
1640 }
1641 VLOG(1) << "TransposeTensor permutation: "
1642 << DebugString(permutation, dims.nbDims);
1643 layer->setFirstTranspose(permutation);
1644
1645 nvinfer1::Dims reshape_dims;
1646 reshape_dims.nbDims = dims.nbDims;
1647 for (int32_t i = 0; i < reshape_dims.nbDims; ++i) {
1648 reshape_dims.d[i] = 0;
1649 }
1650 layer->setReshapeDimensions(reshape_dims);
1651
1652 *output_tensor = layer->getOutput(0);
1653 return Status::OK();
1654 }
1655
1656 Status Converter::GetWeightRange(const TRT_ShapedWeights& weights,
1657 float* out_min, float* out_max) const {
1658 switch (weights.TrtDType()) {
1659 case nvinfer1::DataType::kFLOAT: {
1660 auto inp = static_cast<float const*>(weights.GetValues());
1661 auto result = std::minmax_element(inp, inp + weights.count());
1662 *out_min = *result.first;
1663 *out_max = *result.second;
1664 break;
1665 }
1666 case nvinfer1::DataType::kHALF: {
1667 auto inp = static_cast<Eigen::half const*>(weights.GetValues());
1668 auto result = std::minmax_element(inp, inp + weights.count());
1669 *out_min = static_cast<float>(*result.first);
1670 *out_max = static_cast<float>(*result.second);
1671 break;
1672 }
1673 case nvinfer1::DataType::kINT32: {
1674 auto inp = static_cast<int const*>(weights.GetValues());
1675 auto result = std::minmax_element(inp, inp + weights.count());
1676 *out_min = static_cast<float>(*result.first);
1677 *out_max = static_cast<float>(*result.second);
1678 break;
1679 }
1680 default:
1681 return errors::Unimplemented(
1682 "Data type not supported for GetWeightRange: ",
1683 DebugString(weights.TrtDType()));
1684 }
1685 return Status::OK();
1686 }
1687
1688 // Constructs <tf_related_part> for the ILayer name as
1689 // <tf_node_def_name>_<sub_op_name>_<sub_op_instance> and calls SetLayerNameHelper
1690 // to set the name for the ILayer.
1691 //
1692 // If the operation represented by the ILayer is generated by the converter to
1693 // support the conversion of node_def, callers need to specify a non-empty
1694 // sub_op_name to be appended to the name of node_def to avoid layer name
1695 // conflicts. If the operation is generated multiple times, callers also need
1696 // to specify sub_op_instance to be appended to the name of the layers to avoid
1697 // layer name conflicts.
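// Editorial example (assuming GetLayerNameSuffix joins sub_op_name and
// sub_op_instance with an underscore): node "conv1" with sub_op_name="shuffle"
// and sub_op_instance=2 yields the <tf_related_part> "conv1-shuffle_2", which
// SetLayerNameHelper then scopes under engine_name_.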
1698 void Converter::SetLayerName(nvinfer1::ILayer* layer, const NodeDef& node_def,
1699 absl::string_view sub_op_name,
1700 absl::optional<int> sub_op_instance,
1701 absl::optional<std::string> origin_node_name) {
1702 std::string sub_op_suffix = GetLayerNameSuffix(sub_op_name, sub_op_instance);
1703 if (sub_op_suffix.empty()) {
1704 SetLayerNameHelper(layer, engine_name_, node_def.name());
1705 } else if (origin_node_name.has_value()) {
1706 SetLayerNameHelper(layer, engine_name_,
1707 absl::StrCat(node_def.name(), "-",
1708 absl::string_view(origin_node_name.value()),
1709 "-", sub_op_suffix));
1710 } else {
1711 SetLayerNameHelper(layer, engine_name_,
1712 absl::StrCat(node_def.name(), "-", sub_op_suffix));
1713 }
1714 }
1715
1716 // Constructs <tf_related_part> for the ILayer name as
1717 // <main_op_name>_<sub_op_name>_<sub_op_instance> and calls SetLayerNameHelper to
1718 // set the name for the ILayer.
1719 void Converter::SetLayerName(nvinfer1::ILayer* layer,
1720 absl::string_view main_op_name,
1721 absl::string_view sub_op_name,
1722 absl::optional<int> sub_op_instance) {
1723 std::string layer_name_suffix =
1724 GetLayerNameSuffix(sub_op_name, sub_op_instance);
1725 SetLayerNameHelper(layer, engine_name_,
1726 absl::StrCat(main_op_name, "-", layer_name_suffix));
1727 }
1728
1729 // Converts 'input' of 'node_def' into 'tensor' with shape specified by 'dims'
1730 // (which doesn't contain the batch dimension).
1731 //
1732 // If validation_only is true, it doesn't do the conversion but only does some
1733 // minimal validation for the eligibility of the conversion, and *tensor will
1734 // be set to nullptr.
1735 Status PrepareTensorForShape(Converter* converter,
1736 const TRT_TensorOrWeights& input,
1737 const nvinfer1::Dims& dims,
1738 const bool validation_only,
1739 ITensorProxyPtr* tensor, const NodeDef& node_def,
1740 absl::optional<int> op_instance,
1741 absl::optional<std::string> origin_node_name) {
1742 const nvinfer1::Dims input_dims = input.GetTrtDims();
1743 // The input shape may have -1s for dynamic shape. The target shape may have
1744 // 0s representing copy over the corresponding input dimensions. It may also
1745 // have at most one -1 representing a dimension value that needs to be
1746 // inferred. If none of those special values are present, we verify that the
1747 // total sizes of the input and output shape are the same.
1748 // TODO(tfeher): Verify that the total sizes of the input and output shape are
1749 // the same in the presence of 0s but no -1 in the target shape.
1750 // If an input is a weight, it is going to become a tensor via
1751 // CreateConstantLayer. So we can treat it as a tensor for
1752 // AreDimsStaticWithDifferentSize(). This really only matters for 0-D tensors.
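// Worked example of the special values above (editorial addition):
//   input_dims=(-1, 6), dims=(0, 2, 3): dim 0 is copied from the input, and
//   the remaining 2*3=6 elements must match the input's trailing size;
//   input_dims=(4, 6), dims=(0, -1, 3): the -1 dimension is inferred as 2.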
1753 if (Prod(dims) > 0 && AreDimsStaticWithDifferentSize(input_dims, dims)) {
1754 return errors::InvalidArgument(
1755 "Incompatible shapes: ", DebugString(input_dims), " vs. ",
1756 DebugString(dims));
1757 }
1758 // ConstantLayer requires static shapes (cannot infer -1).
1759 if (input.is_weights() && !HasStaticShape(dims)) {
1760 return errors::InvalidArgument("Shape is not fully defined: ",
1761 DebugString(dims));
1762 }
1763 if (validation_only) {
1764 *tensor = nullptr;
1765 return Status::OK();
1766 }
1767
1768 TFTRT_RETURN_ERROR_IF_NULLPTR(converter, "converter is nullptr");
1769 if (input.is_tensor()) {
1770 if (DimsEqual(input_dims, dims)) {
1771 *tensor = input.tensor();
1772 } else {
1773 nvinfer1::IShuffleLayer* layer =
1774 converter->network()->addShuffle(*input.tensor()->trt_tensor());
1775 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Reshape");
1776 converter->SetLayerName(layer, node_def, "shuffle", op_instance,
1777 origin_node_name);
1778 layer->setReshapeDimensions(dims);
1779 *tensor = layer->getOutput(0);
1780 }
1781 } else {
1782 *tensor = converter->CreateConstantLayer(input.weights(), dims);
1783 TFTRT_RETURN_ERROR_IF_NULLPTR(*tensor, "TF-TRT Internal Reshape");
1784 }
1785 return Status::OK();
1786 }
1787
1788 void Converter::ProvideQuantizationRange(ITensorProxyPtr* tensor,
1789 float min_range, float max_range) {
1790 float symmetric_range = std::max(std::abs(min_range), std::abs(max_range));
1791 if ((*tensor)->is_trt_tensor()) {
1792 quantization_ranges_[(*tensor)->trt_tensor()] = symmetric_range;
1793 } else if ((*tensor)->is_simple_tensor()) {
1794 quantization_ranges_proxy_[tensor] = symmetric_range;
1795 }
1796 }
1797
1798 namespace {
1799
1800 bool IsConvolution(const nvinfer1::ILayer* layer) {
1801 return layer->getType() == nvinfer1::LayerType::kCONVOLUTION;
1802 }
1803
1804 bool IsScale(const nvinfer1::ILayer* layer) {
1805 return layer->getType() == nvinfer1::LayerType::kSCALE;
1806 }
1807
1808 bool IsClipOrRelu(const nvinfer1::ILayer* layer) {
1809 if (layer->getType() != nvinfer1::LayerType::kACTIVATION) {
1810 return false;
1811 }
1812 auto activation_type = static_cast<const nvinfer1::IActivationLayer*>(layer)
1813 ->getActivationType();
1814
1815 return activation_type == nvinfer1::ActivationType::kRELU ||
1816 activation_type == nvinfer1::ActivationType::kCLIP;
1817 }
1818
1819 bool IsAdd(const nvinfer1::ILayer* layer) {
1820 if (layer->getType() != nvinfer1::LayerType::kELEMENTWISE) {
1821 return false;
1822 }
1823 auto operation =
1824 static_cast<const nvinfer1::IElementWiseLayer*>(layer)->getOperation();
1825 return operation == nvinfer1::ElementWiseOperation::kSUM;
1826 }
1827
1828 } // namespace
1829
1830 void Converter::MaybeApplyQuantizationRanges() {
1831 if (precision_mode() != TrtPrecisionMode::INT8) return;
1832
1833 // Apply ranges.
1834 for (auto pair : quantization_ranges_) {
1835 nvinfer1::ITensor* tensor = pair.first;
1836 const float range = pair.second;
1837 VLOG(1) << "Setting range for: " << tensor->getName() << ": " << range;
1838 // TODO(laigd): if 'tensor' already has a range set which doesn't match
1839 // 'range', it should report error.
1840 tensor->setDynamicRange(-range, range);
1841 }
1842 for (auto pair : quantization_ranges_proxy_) {
1843 ITensorProxyPtr tensor = *pair.first;
1844 const float range = pair.second;
1845 VLOG(1) << "Setting range for: " << tensor->getName() << ": " << range;
1846 // TODO(laigd): if 'tensor' already has a range set which doesn't match
1847 // 'range', it should report error.
1848 tensor->setDynamicRange(-range, range);
1849 }
1850 }
1851
1852 Status Converter::GetInputs(const NodeDef& node_def,
1853 std::vector<TRT_TensorOrWeights>* inputs) const {
1854 for (auto const& input_name : node_def.input()) {
1855 /*************************************************************************
1856 * TODO(jie): handle case 1) here.
1857 * Normalizes the inputs and extracts associated metadata:
1858 * 1) Inputs can contain a colon followed by a suffix of characters.
1859 * That suffix may be a single number (e.g. inputName:1) or several
1860 * word characters separated from a number by a colon
1861 * (e.g. inputName:foo:1). The
1862 * latter case is used to denote inputs and outputs of functions.
1863 * 2) Control dependency inputs contain caret at the beginning and we
1864 * remove this and annotate the edge as a control dependency.
1865 ************************************************************************/
1866 // skip control nodes
1867 if (input_name[0] == '^') continue;
1868 string name = input_name;
1869 auto last = name.find_last_of(':');
1870 // TODO(aaroey): use TensorId
1871 if (last != string::npos && last + 2 == name.size() &&
1872 name[last + 1] == '0') {
1873 name.erase(last);
1874 }
1875
1876 if (trt_tensors_.count(name)) {
1877 TRT_TensorOrWeights input = trt_tensors_.at(name);
1878 inputs->push_back(input);
1879 VLOG(2) << "Retrieved input " << name << ": " << input.DebugString();
1880 } else {
1881 // TODO(aaroey): this should not happen, make it a CHECK.
1882 // TODO(aaroey): use StrCat for pattern like this.
1883 string msg("Node ");
1884 StrAppend(&msg, node_def.name(), " should have an input named '", name,
1885 "' but it is not available");
1886 LOG(ERROR) << msg;
1887 return errors::InvalidArgument(msg);
1888 }
1889 }
1890 return Status::OK();
1891 }
1892
1893 enum class TrtInputArg { kTensor = 1, kWeight = 2, kBoth = 3 };
1894
1895 // Checks that the number of inputs matches, and enforces that the inputs marked
1896 // as weights are constant. Inputs are allowed to be both weight and tensor.
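// Editorial usage sketch: a converter expecting a tensor "x" and a constant
// "perm" would call
//   TF_RETURN_IF_ERROR(CheckInputsWeights(
//       *params, {{"x", TrtInputArg::kTensor}, {"perm", TrtInputArg::kWeight}}));
// and get an Unimplemented error if "perm" arrives as a non-constant tensor.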
1897 Status CheckInputsWeights(
1898 const OpConverterParams& params,
1899 const std::vector<std::pair<string, TrtInputArg>>& expected_inputs) {
1900 const auto& inputs = params.inputs;
1901 const auto& node_def = params.node_def;
1902 if (inputs.size() != expected_inputs.size()) {
1903 return errors::InvalidArgument(
1904 node_def.op(), " got ", inputs.size(), " inputs but expected ",
1905 expected_inputs.size(), ", at ", node_def.name());
1906 }
1907 for (int i = 0; i < inputs.size(); i++) {
1908 if (expected_inputs[i].second == TrtInputArg::kWeight &&
1909 inputs.at(i).is_tensor()) {
1910 return errors::Unimplemented("The input \"", expected_inputs[i].first,
1911 "\" for ", node_def.op(),
1912 " must be a constant, at ", node_def.name());
1913 }
1914 // TODO(tfeher): Remove this check and provide a method to automatically
1915 // retrieve an input as a tensor, converting via CreateConstantLayer if it
1916 // was originally a weight. We will want a caching mechanism to prevent many
1917 // duplicate constants from being created.
1918 if (expected_inputs[i].second == TrtInputArg::kTensor &&
1919 inputs.at(i).is_weights()) {
1920 return errors::Unimplemented("The input \"", expected_inputs[i].first,
1921 "\" for ", node_def.op(),
1922 " must be a tensor, at ", node_def.name());
1923 }
1924 }
1925 return Status::OK();
1926 }
1927
1928 // Checks that the number of inputs matches, and enforces that the inputs marked
1929 // as true are constant weights. true means that the input must be a weight,
1930 // while false means the input must be a tensor.
1931 Status CheckInputsWeights(
1932 const OpConverterParams& params,
1933 const std::vector<std::pair<string, bool>>& inputs_is_weight) {
1934 std::vector<std::pair<string, TrtInputArg>> expected_inputs;
1935 expected_inputs.reserve(inputs_is_weight.size());
1936 std::transform(
1937 inputs_is_weight.begin(), inputs_is_weight.end(),
1938 std::back_inserter(expected_inputs), [](std::pair<string, bool> x) {
1939 return std::make_pair(
1940 x.first, x.second ? TrtInputArg::kWeight : TrtInputArg::kTensor);
1941 });
1942 return CheckInputsWeights(params, expected_inputs);
1943 }
1944
1945 Status GetNodeDefTfType(const NodeDef& node_def, DataType* tf_type,
1946 const char* type_attr_name) {
1947 TFAttrs attrs(node_def);
1948 if (!attrs.count(type_attr_name)) {
1949 return errors::InvalidArgument("Attribute with name ", type_attr_name,
1950 " not found.");
1951 }
1952 *tf_type = attrs.get<DataType>(type_attr_name);
1953 return Status::OK();
1954 }
1955
1956 Status GetInputTfType(const OpConverterParams& params, DataType* tf_type,
1957 int pos) {
1958 const std::vector<TRT_TensorOrWeights>& inputs = params.inputs;
1959 if (inputs.size() <= pos) {
1960 return errors::Internal("Invalid input position");
1961 }
1962
1963 return inputs[pos].GetTfType(tf_type);
1964 }
1965
1966 constexpr const char kOutputTypeAttrName[] = "T";
1967
1968 Status GetOutputTfType(const OpConverterParams& params, DataType* tf_type) {
1969 return GetNodeDefTfType(params.node_def, tf_type, kOutputTypeAttrName);
1970 }
1971
1972 Status AllowDataTypes(const OpConverterParams& params,
1973 const std::set<DataType>& allowed_types,
1974 const char* type_attr_name = kOutputTypeAttrName) {
1975 const auto& node_def = params.node_def;
1976 DataType tf_type;
1977 TF_RETURN_IF_ERROR(GetNodeDefTfType(node_def, &tf_type, type_attr_name));
1978 if (!allowed_types.count(tf_type)) {
1979 string allowed_types_string = absl::StrJoin(
1980 allowed_types, ", ", [](string* out, const DataType& type) {
1981 absl::StrAppendFormat(out, "%s", DataTypeString(type));
1982 });
1983 return errors::Unimplemented("Data type ", DataTypeString(tf_type),
1984 " is not supported for ", node_def.op(),
1985 ", must be one of [", allowed_types_string,
1986 "], at ", node_def.name());
1987 }
1988 return Status::OK();
1989 }
1990
1991 // ****************************************************************************
1992 // Constant folding functions for weights.
1993 // TODO(laigd): we should probably use eigen directly.
1994 // *****************************************************************************
1995 struct LambdaFactory {
1996 enum class OP_CATEGORY : int { RSQRT = 0, NEG, RECIP };
1997 OP_CATEGORY op;
1998
1999 template <typename T>
2000 std::function<T(T)> unary() {
2001 switch (op) {
2002 case OP_CATEGORY::RSQRT: {
2003 VLOG(2) << "RSQRT GETS DONE";
2004 return [](T t) -> T { return 1.0 / std::sqrt(t); };
2005 }
2006 case OP_CATEGORY::NEG:
2007 return [](T t) -> T { return -t; };
2008 case OP_CATEGORY::RECIP:
2009 return [](T t) -> T { return 1.0 / t; };
2010 default:
2011 LOG(ERROR) << "Not supported op for unary: " << static_cast<int>(op);
2012 return nullptr;
2013 }
2014 }
2015 };
2016
2017 template <>
2018 std::function<Eigen::half(Eigen::half)> LambdaFactory::unary<Eigen::half>() {
2019 switch (op) {
2020 case OP_CATEGORY::RSQRT: {
2021 VLOG(2) << "RSQRT GETS DONE";
2022 return [](Eigen::half t) {
2023 return Eigen::half(1.0 / std::sqrt(static_cast<float>(t)));
2024 };
2025 }
2026 case OP_CATEGORY::NEG:
2027 return [](Eigen::half t) { return -t; };
2028 case OP_CATEGORY::RECIP:
2029 return [](Eigen::half t) {
2030 return Eigen::half(1.0 / static_cast<float>(t));
2031 };
2032 default:
2033 LOG(ERROR) << "Not supported op for unary: " << static_cast<int>(op);
2034 return nullptr;
2035 }
2036 }
2037
2038 Status UnaryCompute(const TRT_ShapedWeights& iweights,
2039 TRT_ShapedWeights* oweights, LambdaFactory unary_op) {
2040 CHECK(iweights.TrtDType() == oweights->TrtDType());
2041 switch (iweights.TrtDType()) {
2042 case nvinfer1::DataType::kFLOAT: {
2043 auto inp = static_cast<float const*>(iweights.GetValues());
2044 auto oup = static_cast<float*>(oweights->GetValues());
2045 std::transform(inp, inp + iweights.count(), oup, unary_op.unary<float>());
2046 break;
2047 }
2048 case nvinfer1::DataType::kHALF: {
2049 auto inp = static_cast<Eigen::half const*>(iweights.GetValues());
2050 auto oup = static_cast<Eigen::half*>(oweights->GetValues());
2051 std::transform(inp, inp + iweights.count(), oup,
2052 unary_op.unary<Eigen::half>());
2053 break;
2054 }
2055 default:
2056 return errors::Unimplemented("Data type not supported: ",
2057 DebugString(iweights.TrtDType()));
2058 }
2059 return Status::OK();
2060 }
2061
2062 Status Conv2DPaddingHelper(OpConverterParams* params, const TFAttrs& attrs,
2063 const nvinfer1::DimsHW& kernel_size,
2064 const nvinfer1::DimsHW& dilation,
2065 const nvinfer1::DimsHW& stride,
2066 const std::vector<int64_t>& input_dims,
2067 ITensorProxyPtr tensor,
2068 std::vector<std::pair<int, int>>* padding,
2069 ITensorProxyPtr* padded_tensor) {
2070 if (attrs.get<string>("padding") == "SAME") {
2071 nvinfer1::DimsHW effective_kernel_size = kernel_size;
2072 effective_kernel_size.h() += (kernel_size.h() - 1) * (dilation.h() - 1);
2073 effective_kernel_size.w() += (kernel_size.w() - 1) * (dilation.w() - 1);
2074 *padding = CreateSamePadding(stride, effective_kernel_size, input_dims);
2075 } else {
2076 *padding = {{0, 0}, {0, 0}};
2077 }
2078
2079 if ((*padding)[0].first != (*padding)[0].second ||
2080 (*padding)[1].first != (*padding)[1].second) {
2081 auto pad_layer = params->converter->network()->addPadding(
2082 *tensor->trt_tensor(),
2083 nvinfer1::DimsHW((*padding)[0].first, (*padding)[1].first),
2084 nvinfer1::DimsHW((*padding)[0].second, (*padding)[1].second));
2085 TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, params->node_def.name());
2086 params->converter->SetLayerName(pad_layer, params->node_def, "pad");
2087 tensor = pad_layer->getOutput(0);
2088 *padding = {{0, 0}, {0, 0}};
2089 }
2090 *padded_tensor = tensor;
2091 return Status::OK();
2092 }
2093
2094 namespace {
2095 // Extracts the spatial dimensions from `output_sizes` and returns them as a
2096 // vector of size 2.
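// Editorial example: with NHWC input (h_index=1, w_index=2) and
// output_sizes=[1, 28, 28, 16] this returns {28, 28}; output_sizes=[28, 28]
// also returns {28, 28}.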
2097 std::vector<int64_t> GetSpatialDimsFromOutputSizes(
2098 const TRT_TensorOrWeights& output_sizes, const int h_index,
2099 const int w_index) {
2100 // We use h_index and w_index instead of 1 and 2 because we haven't
2101 // transposed output_sizes along with the input.
2102 const TRT_ShapedWeights& weights = output_sizes.weights();
2103 const int output_sizes_length = weights.count();
2104 auto output_sizes_values = static_cast<int*>(weights.GetValues());
2105 // The length of output_sizes can be 2 or 4. When it is 2, output_sizes is
2106 // <height,width> directly; when it is 4, it is the full output shape.
2107 return {output_sizes_values[output_sizes_length == 4 ? h_index : 0],
2108 output_sizes_values[output_sizes_length == 4 ? w_index : 1]};
2109 }
2110 } // namespace
2111
2112 Status ConvertConv2DHelper(OpConverterParams* params, int group,
2113 bool is_conv2d_backprop_input) {
2114 const auto& inputs = params->inputs;
2115 const auto& node_def = params->node_def;
2116 TRT_TensorOrWeights backprop_output_size;
2117 ITensorProxyPtr tensor = nullptr;
2118 if (is_conv2d_backprop_input) {
2119 // In the case when Conv2dBackpropInput is used for conv2d_transpose, these
2120 // inputs correspond to: output size, filter, and input.
2121 TF_RETURN_IF_ERROR(CheckInputsWeights(
2122 *params,
2123 {{"input_sizes", true}, {"filter", true}, {"out_backprop", false}}));
2124 backprop_output_size = inputs.at(0);
2125 tensor = inputs.at(2).tensor();
2126 if (!HasStaticShape(tensor->getDimensions())) {
2127 // TODO(tfeher): Allow dynamic input. We need to implement padding
2128 // correction for dynamic shapes in this case.
2129 return errors::Unimplemented(
2130 "Conv2dBackpropInput does not support input with unknown shape, at ",
2131 node_def.name());
2132 }
2133 } else {
2134 TF_RETURN_IF_ERROR(
2135 CheckInputsWeights(*params, {{"input", false}, {"filter", true}}));
2136 tensor = inputs.at(0).tensor();
2137 }
2138 TF_RETURN_IF_ERROR(
2139 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
2140 TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
2141 if (weights_rsck.shape_.nbDims != 4) {
2142 return errors::InvalidArgument("Conv2D expects kernel of dimension 4, at " +
2143 node_def.name());
2144 }
2145 TFAttrs attrs(node_def);
2146 auto data_format = attrs.get<string>("data_format");
2147 int c_index = (data_format == "NHWC") ? 3 : 1;
2148 int h_index = (data_format == "NHWC") ? 1 : 2;
2149 int w_index = (data_format == "NHWC") ? 2 : 3;
2150 auto tf_dilations = attrs.get<std::vector<int64>>("dilations");
2151 if (tf_dilations.size() != 4) {
2152 return errors::InvalidArgument(
2153 "Convolution dilations field must specify 4 dimensions, at ",
2154 node_def.name());
2155 }
2156 if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) {
2157 return errors::Unimplemented(
2158 "Dilation rate must be 1 for batch and channel dimensions, at ",
2159 node_def.name());
2160 }
2161 const nvinfer1::DimsHW dilation(tf_dilations[h_index], tf_dilations[w_index]);
2162 if (is_conv2d_backprop_input && (dilation.d[0] != 1 || dilation.d[1] != 1)) {
2163 return errors::Unimplemented(
2164 "Dilation with Conv2DBackpropInput (conv2d_transpose) is not supported",
2165 ", at ", node_def.name());
2166 }
2167
2168 const auto tf_stride = attrs.get<std::vector<int64>>("strides");
2169 if (tf_stride.size() != 4) {
2170 return errors::InvalidArgument(
2171 "Convolution strides field must specify 4 dimensions, at ",
2172 node_def.name());
2173 }
2174 if (tf_stride[0] != 1 || tf_stride[c_index] != 1) {
2175 return errors::Unimplemented(
2176 "Stride must be 1 for batch and channel dimensions, at ",
2177 node_def.name());
2178 }
2179 // Channel dim must be static for DepthwiseConv2dNative since we use that
2180 // value for num_groups at build time.
2181 if (!params->use_implicit_batch && tensor->getDimensions().d[c_index] == -1) {
2182 return errors::InvalidArgument("Channel dimension must be static, at ",
2183 node_def.name());
2184 }
2185 string padding = attrs.get<string>("padding");
2186 if (padding != "SAME" && padding != "VALID") {
2187 return errors::Unimplemented(padding +
2188 " padding type not implemented, "
2189 "only VALID and SAME are supported");
2190 }
2191 const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
2192 if (params->validation_only) return Status::OK();
2193
2194 // Transpose to NCHW (NCHW is required for IConvLayer).
2195 const bool need_transpose = (data_format == "NHWC");
2196 if (need_transpose) {
2197 TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
2198 tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW"));
2199 }
2200 // Dimensions of transposed tensor.
2201 const auto tensor_dim = tensor->getDimensions();
2202 const int c_dim_size = tensor_dim.d[params->use_implicit_batch ? 0 : 1];
2203
2204 // group == 0 signifies that this is a depthwise convolution, so set
2205 // num_groups to size of input's channel dim. For a non-depthwise conv,
2206 // num_groups will be 1.
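// Editorial example: DepthwiseConv2dNative over a 32-channel input is
// converted with group == 0, so num_groups becomes 32; a regular Conv2D is
// converted with group == 1.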
2207 const int num_groups = (group == 0) ? c_dim_size : group;
2208
2209 // For conv, TF weights are RSCK, and TRT expects KCRS.
2210 // For backprop, TF weights are RSKC, and TRT expects CKRS.
2211 // Therefore, this reorder will work for both cases.
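// Editorial example: a 3x3 Conv2D kernel with 16 input channels and 32 output
// filters stored as RSCK (3, 3, 16, 32) is reordered to KCRS (32, 16, 3, 3),
// which matches the kernel_size extraction from shape_.d[2] and shape_.d[3]
// below.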
2212 TRT_ShapedWeights weights =
2213 params->weight_store->GetTempWeights(weights_rsck);
2214 ReorderRSCKToKCRS(weights_rsck, &weights, num_groups);
2215 TRT_ShapedWeights biases(weights.TrtDType());
2216 const int output_axis = is_conv2d_backprop_input ? 1 : 0;
2217 const int noutput = weights.shape_.d[output_axis] * num_groups;
2218 nvinfer1::DimsHW kernel_size;
2219 kernel_size.h() = weights.shape_.d[2];
2220 kernel_size.w() = weights.shape_.d[3];
2221
2222 // Add convolution.
2223 nvinfer1::ILayer* conv_layer = nullptr;
2224 if (is_conv2d_backprop_input) {
2225 nvinfer1::IDeconvolutionLayer* layer =
2226 params->converter->network()->addDeconvolution(
2227 *tensor->trt_tensor(), noutput, kernel_size,
2228 weights.GetTrtWeights(), biases.GetTrtWeights());
2229 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
2230 layer->setStride(stride);
2231 // VALID padding is the default TRT behavior.
2232 if (attrs.get<string>("padding") == "SAME") {
2233 // SAME_UPPER means that post padding is preferred.
2234 layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
2235 }
2236 layer->setNbGroups(num_groups);
2237 conv_layer = layer;
2238 } else {
2239 nvinfer1::IConvolutionLayer* layer =
2240 params->converter->network()->addConvolution(
2241 *tensor->trt_tensor(), noutput, kernel_size,
2242 weights.GetTrtWeights(), biases.GetTrtWeights());
2243 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
2244 layer->setStride(stride);
2245 if (attrs.get<string>("padding") == "SAME") {
2246 layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
2247 }
2248 layer->setNbGroups(num_groups);
2249 layer->setDilation(dilation);
2250 conv_layer = layer;
2251 }
2252 params->converter->SetLayerName(conv_layer, node_def, "conv");
2253 ITensorProxyPtr output_tensor = conv_layer->getOutput(0);
2254 // Add extra padding for Deconv because TRT doesn't accept an output_shape
2255 // argument, and thus the TRT output shape could be wrong when strides > 1.
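// Editorial illustration: if input_sizes requests a 14x14 output while TRT
// computes 13x13 for the deconvolution, height_diff and width_diff are both 1
// and a post-padding of (1, 1) is appended below.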
2257 if (is_conv2d_backprop_input) {
2258 std::vector<int64_t> output_spatial_dims =
2259 GetSpatialDimsFromOutputSizes(backprop_output_size, h_index, w_index);
2260 const int output_height = output_spatial_dims[0];
2261 const int output_width = output_spatial_dims[1];
2262 nvinfer1::Dims trt_output_shape = output_tensor->getDimensions();
2263 // What determines the padding size is the difference between the given
2264 // input_sizes (tf_output_shape) and TRT computed size.
2265 int out_h_idx = params->use_implicit_batch ? 1 : 2;
2266 int out_w_idx = params->use_implicit_batch ? 2 : 3;
2267 const int height_diff = output_height - trt_output_shape.d[out_h_idx];
2268 const int width_diff = output_width - trt_output_shape.d[out_w_idx];
2269 if ((height_diff < 0) || (width_diff < 0)) {
2270 return errors::InvalidArgument(
2271 "input_sizes argument of Conv2DBackprop (i.e. output_shape argument "
2272 "of conv2d_transpose) ",
2273 "is too small for the given out_backprop argument of Conv2DBackprop "
2274 "(i.e. input argument of conv2d_transpose). Expect: ",
2275 "(", output_height, ", ", output_width, ") >= ", "(",
2276 trt_output_shape.d[out_h_idx], ", ", trt_output_shape.d[out_w_idx],
2277 ") for op ", node_def.name());
2278 }
2279 // Only add a padding layer if padding sizes are larger than 0
2280 if ((height_diff > 0) || (width_diff > 0)) {
2281 nvinfer1::DimsHW pre_padding(0, 0);
2282 nvinfer1::DimsHW post_padding(height_diff, width_diff);
2283 nvinfer1::IPaddingLayer* padding_layer =
2284 params->converter->network()->addPadding(*output_tensor->trt_tensor(),
2285 pre_padding, post_padding);
2286 output_tensor = padding_layer->getOutput(0);
2287 params->converter->SetLayerName(padding_layer, node_def, "pad");
2288 }
2289 }
2290 // Restore transpose.
2291 if (need_transpose) {
2292 TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
2293 output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC"));
2294 }
2295 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
2296 return Status::OK();
2297 }
2298
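// Editorial note: the size cutoff in ConvertTranspose below can be bypassed for
// debugging by setting the environment variable, e.g.
//   TF_DEBUG_TRT_ALLOW_INEFFICIENT_TRANSPOSE=1
// which lets the conversion proceed on pre-7.1.3.4 TensorRT even when the
// tensor exceeds kMaxEfficientTranspose.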
2299 bool AllowInefficientTranspose() {
2300 static bool result = [] {
2301 bool value;
2302 Status status =
2303 ReadBoolFromEnvVar("TF_DEBUG_TRT_ALLOW_INEFFICIENT_TRANSPOSE",
2304 /*default_value=*/false, &value);
2305 if (!status.ok()) {
2306 LOG(ERROR) << status;
2307 }
2308 return value;
2309 }();
2310
2311 return result;
2312 }
2313
2314 Status ConvertTranspose(OpConverterParams* params) {
2315 const auto& inputs = params->inputs;
2316 TF_RETURN_IF_ERROR(
2317 CheckInputsWeights(*params, {{"x", false}, {"perm", true}}));
2318 TF_RETURN_IF_ERROR(AllowDataTypes(
2319 *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
2320 // Get the permutation from weights.
2321 TRT_ShapedWeights weights = inputs.at(1).weights();
2322 const int* weights_ptr = static_cast<int*>(weights.GetValues());
2323 std::vector<int> perm(weights_ptr, weights_ptr + weights.count());
2324
2325 // Verify the permutation.
2326 ITensorProxyPtr input_tensor = inputs.at(0).tensor();
2327 const int perm_size =
2328 params->use_implicit_batch ? perm.size() - 1 : perm.size();
2329 if (perm_size != size_t(input_tensor->getDimensions().nbDims)) {
2330 return errors::InvalidArgument(
2331 "Rank of perm for transpose does not match with that of the input.");
2332 }
2333 if (params->use_implicit_batch && perm[0] != 0) {
2334 return errors::Unimplemented(
2335 "Transpose at batch dimension is not supported.");
2336 }
2337
2338 #if !IS_TRT_VERSION_GE(7, 1, 3, 4)
2339 // TensorRT versions before 7.1.3.4 are slow at transposing large tensors.
2340 // So check the tensor size, and don't convert if it is too large.
2341 constexpr int64_t kMaxEfficientTranspose = 2500000;
2342 int64_t tensor_size = TrtTensorDimsNumElements(input_tensor->getDimensions());
2343 if (!AllowInefficientTranspose() && tensor_size > kMaxEfficientTranspose) {
2344 return errors::Unimplemented(StrCat("Transpose too large:", tensor_size));
2345 }
2346 #endif
2347
2348 if (params->validation_only) return Status::OK();
2349
2350 // Start conversion.
2351 ITensorProxyPtr output_tensor = nullptr;
2352 TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
2353 input_tensor, perm, &output_tensor, params->node_def));
2354 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
2355 return Status::OK();
2356 }
2357
2358 Status ConvertShape(OpConverterParams* params) {
2359 const auto& inputs = params->inputs;
2360 TF_RETURN_IF_ERROR(
2361 CheckInputsWeights(*params, {{"input", TrtInputArg::kBoth}}));
2362 if (params->use_implicit_batch) {
2363 return errors::Unimplemented(
2364 "Shape is only supported for explicit batch mode.");
2365 }
2366 if (HasStaticShape(inputs.at(0).GetTrtDims())) {
2367 if (params->validation_only) return Status::OK();
2368 nvinfer1::Dims input_dims = inputs.at(0).GetTrtDims();
2369 nvinfer1::Dims output_dims{1, {input_dims.nbDims}};
2370 // Create a const node with the values of output_dims
2371 TRT_ShapedWeights weight = params->weight_store->GetTempWeights(
2372 nvinfer1::DataType::kINT32, output_dims);
2373 int32* values_ptr = static_cast<int32*>(weight.GetValues());
2374 std::copy(input_dims.d, input_dims.d + input_dims.nbDims, values_ptr);
2375 auto output = params->converter->CreateConstantLayer(weight, output_dims);
2376 params->outputs->push_back(TRT_TensorOrWeights(output));
2377 return Status::OK();
2378 }
2379 if (params->validation_only) return Status::OK();
2380 nvinfer1::IShapeLayer* shape_layer = params->converter->network()->addShape(
2381 *inputs.at(0).tensor()->trt_tensor());
2382 TFTRT_RETURN_ERROR_IF_NULLPTR(shape_layer, params->node_def.name());
2383 params->converter->SetLayerName(shape_layer, params->node_def, "shape");
2384 params->outputs->push_back(TRT_TensorOrWeights(shape_layer->getOutput(0)));
2385 return Status::OK();
2386 }
2387
2388 Status ExpectShapeTensor(const TRT_TensorOrWeights& tensor) {
2389 if (tensor.tensor()->getType() != nvinfer1::DataType::kINT32) {
2390 return errors::InvalidArgument("Expected a shape tensor with INT32 type");
2391 }
2392 if (tensor.GetTrtDims().nbDims > 1) {
2393 return errors::InvalidArgument("Expected a 0D or 1D shape tensor");
2394 }
2395 return Status::OK();
2396 }
2397
2398 // Converts Reshape op if the input has dynamic (unknown) dims.
2399 Status ConvertDynamicReshape(OpConverterParams* params) {
2400 if (params->use_implicit_batch) {
2401 return errors::InvalidArgument(
2402 "The input \"shape\" for Reshape must be a constant in implicit batch"
2403 " mode, at ",
2404 params->node_def.name());
2405 }
2406 if (!IS_TRT_VERSION_GE(7, 1, 3, 0)) {
2407 // While TRT officially supports shape value inputs, there are problems
2408 // with shape input handling that cause networks converted with
2409 // ConvertDynamicReshape to fail. Here we conservatively switch off the
2410 // converter before TRT 7.1.3.
2411 return errors::InvalidArgument(
2412 "Non constant shape input tensor for Reshape requires minimum TRT "
2413 "7.1.3");
2414 }
2415 const auto& inputs = params->inputs;
2416 const TRT_TensorOrWeights& input_tensor = inputs.at(0);
2417
2418 // If the input is a tensor it must be a shape tensor.
2419 TF_RETURN_IF_ERROR(ExpectShapeTensor(inputs.at(1)));
2420 if (inputs.at(1).tensor()->getDimensions().nbDims == 0) {
2421 // Dynamic reshape requires a 1D shape tensor.
2422 return errors::Unimplemented(
2423 "Reshape with dynamic input requires 1D input tensor, at ",
2424 params->node_def.name());
2425 }
2426 if (params->validation_only) return Status::OK();
2427 nvinfer1::IShuffleLayer* layer = params->converter->network()->addShuffle(
2428 *input_tensor.tensor()->trt_tensor());
2429 VLOG(2) << "ConvertReshape setInput (1) "
2430 << DebugString(inputs.at(1).tensor()->getDimensions());
2431 layer->setInput(1, *inputs.at(1).tensor()->trt_tensor());
2432 params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
2433 return Status::OK();
2434 }
2435
2436 // Converts Reshape in explicit batch mode if the input has static (known) dims.
2437 Status ConvertStaticReshapeForExplicitBatchMode(
2438 OpConverterParams* params, const int* output_dims, int num_dims,
2439 ITensorProxyPtr* output_tensor) {
2440 nvinfer1::Dims dims;
2441 dims.nbDims = num_dims;
2442 std::copy(output_dims, output_dims + num_dims, dims.d);
2443 return PrepareTensorForShape(params->converter, params->inputs.at(0), dims,
2444 params->validation_only, output_tensor,
2445 params->node_def);
2446 }
2447
2448 // Converts Reshape in implicit batch mode. The input has static (known) dims.
2449 Status ConvertStaticReshapeForImplicitBatchMode(
2450 OpConverterParams* params, const int* output_shape_dims,
2451 int output_shape_dims_count, ITensorProxyPtr* output_tensor) {
2452 const auto& inputs = params->inputs;
2453 const TRT_TensorOrWeights& input_tensor = inputs.at(0);
2454 const int input_batch_dim = input_tensor.batch_size();
2455 const int output_batch_dim =
2456 (output_shape_dims_count > 0) ? output_shape_dims[0] : 0;
2457
2458 const nvinfer1::Dims input_nonbatch_dims = input_tensor.GetTrtDims();
2459 nvinfer1::Dims output_nonbatch_dims;
2460 output_nonbatch_dims.nbDims = output_shape_dims_count - 1;
2461 for (int i = 1; i < output_shape_dims_count; i++) {
2462 output_nonbatch_dims.d[i - 1] = output_shape_dims[i];
2463 }
2464
2465 VLOG(1) << "input_batch_dim=" << input_batch_dim
2466 << ", input_nonbatch_dims=" << DebugString(input_nonbatch_dims)
2467 << "\nresult_batch_dim=" << output_batch_dim
2468 << ", result_nonbatch_dims=" << DebugString(output_nonbatch_dims);
2469
2470 // Check whether input_batch_dim and output_batch_dim will have the same
2471 // static value.
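// Editorial example: with input batch dim 8 and nonbatch dims (2, 3), a target
// shape of (8, 6) keeps the batch dim and is accepted, while (4, 12) would
// change the batch dim and is rejected below.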
2472 bool reshape_may_change_batch_dim = false;
2473 if (input_batch_dim != -1 && output_batch_dim != -1) {
2474 reshape_may_change_batch_dim = (input_batch_dim != output_batch_dim);
2475 } else {
2476 reshape_may_change_batch_dim =
2477 !AreDimsStaticWithSameSize(input_nonbatch_dims, output_nonbatch_dims);
2478 }
2479 if (reshape_may_change_batch_dim) {
2480 const string msg =
2481 StrCat("Reshape on batch dimension is not supported, at ",
2482 params->node_def.name(), ". input_batch_dim=", input_batch_dim,
2483 ", ", DebugString(input_nonbatch_dims),
2484 "; output_batch_dim=", output_batch_dim, ", ",
2485 DebugString(output_nonbatch_dims));
2486 return errors::Unimplemented(msg);
2487 }
2488 // Perform the conversion.
2489 return PrepareTensorForShape(params->converter, input_tensor,
2490 output_nonbatch_dims, params->validation_only,
2491 output_tensor, params->node_def);
2492 }
2493
2494 Status ConvertReshape(OpConverterParams* params) {
2495 const auto& inputs = params->inputs;
2496 TF_RETURN_IF_ERROR(CheckInputsWeights(
2497 *params,
2498 {{"tensor", TrtInputArg::kTensor}, {"shape", TrtInputArg::kBoth}}));
2499 TF_RETURN_IF_ERROR(AllowDataTypes(
2500 *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
2501 if (inputs.at(1).is_tensor()) {
2502 return ConvertDynamicReshape(params);
2503 }
2504
2505 // TODO(bixia): we can't use inputs.at(1).weights().ToVector<int>() for two
2506 // reasons: (1) When weights.count()==0, TRT_ShapedWeights::tensor_ dtype is
2507 // not properly set to INT32. (2) I tried a fix for the first problem, but got a
2508 // shared pointer related error in convert_nodes_test. We should fix the
2509 // problems and switch to use inputs.at(1).weights().ToVector<int>(), a type
2510 // safe method to access the content of the tensor.
2511 TRT_ShapedWeights weights = inputs.at(1).weights();
2512 if (weights.count() == 0 && params->use_implicit_batch) {
2513 return errors::Unimplemented("Reshape to shape=[] is not supported, at ",
2514 params->node_def.name());
2515 }
2516
2517 const int* output_shape_dims = static_cast<int*>(weights.GetValues());
2518 size_t output_shape_dims_count = weights.count();
2519 ITensorProxyPtr output_tensor = nullptr;
2520
2521 if (!params->use_implicit_batch) {
2522 TF_RETURN_IF_ERROR(ConvertStaticReshapeForExplicitBatchMode(
2523 params, output_shape_dims, output_shape_dims_count, &output_tensor));
2524 } else {
2525 TF_RETURN_IF_ERROR(ConvertStaticReshapeForImplicitBatchMode(
2526 params, output_shape_dims, output_shape_dims_count, &output_tensor));
2527 }
2528 if (params->validation_only) return Status::OK();
2529
2530 // Record the conversion result.
2531 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
2532 return Status::OK();
2533 }
2534
2535 Status ConvertExpandDims(OpConverterParams* params) {
2536 const auto& inputs = params->inputs;
2537 const auto& node_def = params->node_def;
2538 TF_RETURN_IF_ERROR(
2539 CheckInputsWeights(*params, {{"input", false}, {"axis", true}}));
2540 TF_RETURN_IF_ERROR(AllowDataTypes(
2541 *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
2542 // Get input shape as vector.
2543 const TRT_TensorOrWeights& input_tensor = inputs.at(0);
2544 const nvinfer1::Dims dims = input_tensor.GetTrtDims();
2545 std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
2546 // Get axis to expand on.
2547 auto axis = inputs.at(1).weights().GetSpan<int>();
2548 if (axis.size() != 1) {
2549 return errors::InvalidArgument("ExpandDims axis must be a scalar, at ",
2550 node_def.name());
2551 }
2552 // Use rank = nbDims + 1 for ConvertAxis's bounds checking to account for
2553 // ExpandDim's ability to add an axis at end of the shape.
2554 int trt_axis;
2555 TF_RETURN_IF_ERROR(ConvertAxis(axis[0], dims.nbDims + 1, node_def.name(),
2556 params->use_implicit_batch, &trt_axis));
2557 if (params->validation_only) return Status::OK();
2558 ITensorProxyPtr output_tensor = nullptr;
2559
2560 if (!params->use_implicit_batch && !HasStaticShape(input_dims)) {
2561 TF_RETURN_IF_ERROR(params->converter->DynamicExpandDims(
2562 input_tensor.tensor(), dims, trt_axis, params, &output_tensor));
2563 } else {
2564 // ExpandDims: Insert new dim of size 1.
2565 input_dims.insert(input_dims.begin() + trt_axis, 1);
2566 // Reshape tensor.
2567 nvinfer1::Dims new_dims;
2568 TF_RETURN_IF_ERROR(ContainerToTrtDims(input_dims, &new_dims));
2569 TF_RETURN_IF_ERROR(PrepareTensorForShape(
2570 params->converter, input_tensor, new_dims, /*validation_only=*/false,
2571 &output_tensor, params->node_def));
2572 }
2573 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
2574 return Status::OK();
2575 }
2576
2577 Status Converter::DynamicReshape(ITensorProxyPtr input,
2578 std::vector<std::pair<int, int>> slices,
2579 OpConverterParams* params,
2580 ITensorProxyPtr* output,
2581 std::vector<int> size_for_added_dims,
2582 absl::optional<int> op_instance) {
2583 *output = nullptr;
2584 // DynamicReshape relies on INetworkDefinition::addShape
2585 if (params->validation_only) {
2586 return errors::Internal(
2587 "DynamicReshape should not be used during validation");
2588 }
2589 ITensorProxyPtr shape =
2590 network()->addShape(*input->trt_tensor())->getOutput(0);
2591 // Build new shape = shape[:trt_axis] + [1] + shape[trt_axis:]
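// Editorial example (as invoked from DynamicExpandDims): for a rank-3 input and
// trt_axis=1, slices={{0,1},{1,3}} and size_for_added_dims={-1,1}, so the
// concatenated shape tensor is [shape[0], 1, shape[1], shape[2]].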
2592 std::vector<ITensorProxyPtr> concat_inputs;
2593 int max_num_slices = std::max(slices.size(), size_for_added_dims.size());
2594 int op_instance_value = op_instance.has_value() ? op_instance.value() : 0;
2595 for (int i = 0; i < max_num_slices; i++) {
2596 ITensorProxyPtr tensor;
2597 int slice_instance = i * max_num_slices + op_instance_value;
2598 // maybe_add_a_dimension(i);
2599 if (i < size_for_added_dims.size() && size_for_added_dims[i] >= 0) {
2600 nvinfer1::Dims dims{1, {1}};
2601 if (size_for_added_dims[i] > 0) {
2602 dims.d[0] = size_for_added_dims[i];
2603 }
2604 TF_RETURN_IF_ERROR(
2605 CreateScalarConstant(params, std::min(size_for_added_dims[i], 1),
2606 &tensor, nvinfer1::DataType::kINT32, dims));
2607 concat_inputs.push_back(tensor);
2608 }
2609 if (i < slices.size()) {
2610 nvinfer1::ISliceLayer* slice_layer = network()->addSlice(
2611 *shape->trt_tensor(), {1, {slices[i].first}},
2612 {1, {slices[i].second - slices[i].first}}, {1, {1}});
2613 concat_inputs.push_back(slice_layer->getOutput(0));
2614 SetLayerName(slice_layer, params->node_def, "slice", slice_instance);
2615 }
2616 }
2617 std::vector<nvinfer1::ITensor*> trt_concat_inputs;
2618 for (const auto& t : concat_inputs) {
2619 trt_concat_inputs.push_back(t->trt_tensor());
2620 }
2621 nvinfer1::IConcatenationLayer* concat_layer = network()->addConcatenation(
2622 static_cast<nvinfer1::ITensor* const*>(trt_concat_inputs.data()),
2623 concat_inputs.size());
2624 SetLayerName(concat_layer, params->node_def, "concat", op_instance);
2625 concat_layer->setAxis(0);
2626 ITensorProxyPtr new_shape = concat_layer->getOutput(0);
2627 // Reshape input using new shape
2628 nvinfer1::IShuffleLayer* shuffle =
2629 network()->addShuffle(*input->trt_tensor());
2630 SetLayerName(shuffle, params->node_def, "shuffle", op_instance);
2631 shuffle->setInput(1, *new_shape->trt_tensor());
2632 *output = shuffle->getOutput(0);
2633 return Status::OK();
2634 }
2635
2636 Status Converter::DynamicExpandDims(ITensorProxyPtr input,
2637 const nvinfer1::Dims& dims, int axis,
2638 OpConverterParams* params,
2639 ITensorProxyPtr* output,
2640 absl::optional<int> op_instance) {
2641 if (params->validation_only) {
2642 *output = nullptr;
2643 return errors::Internal(
2644 "DynamicExpandDims should not be used during validation");
2645 }
2646 std::vector<std::pair<int, int>> slices;
2647 std::vector<int> extra_dims;
2648 if (axis != 0) {
2649 slices.push_back(std::pair<int, int>{0, axis});
2650 extra_dims.push_back(-1);
2651 }
2652 extra_dims.push_back(1);
2653 if (axis != dims.nbDims) {
2654 slices.push_back(std::pair<int, int>{axis, dims.nbDims});
2655 }
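  // For illustration: expanding a rank-3 input at axis 1 yields
  // slices = {{0, 1}, {1, 3}} and extra_dims = {-1, 1}; expanding at axis 0
  // yields slices = {{0, 3}} and extra_dims = {1}.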
2656 return DynamicReshape(input, slices, params, output, extra_dims, op_instance);
2657 }
2658
2659 Status Converter::SqueezeTensor(ITensorProxyPtr input,
2660 std::vector<int>* input_dims,
2661 OpConverterParams* params,
2662 ITensorProxyPtr* output) {
2663 // If the remaining dimensions of a squeeze operation have dynamic sizes, we
2664 // need to use TRT ops to build the result shape for the squeeze operation.
2665 // This is because IShuffleLayer::setReshapeDimensions treats -1 as a special
2666 // value.
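  // For illustration: if *input_dims is {-1, 0, 3} (0 marks a dimension to
  // remove), the dynamic path below slices the shape tensor at {0, 1} and
  // {2, 3} and concatenates the pieces, so the result has shape [-1, 3].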
2667 if (!params->use_implicit_batch && !HasStaticShape(*input_dims)) {
2668 std::vector<std::pair<int, int>> slices;
2669 for (int i = 0; i < input_dims->size(); i++) {
2670 if (input_dims->at(i) != 0) {
2671 slices.push_back(std::pair<int, int>(i, i + 1));
2672 }
2673 }
2674 return DynamicReshape(input, slices, params, output);
2675 }
2676 // Remove all dims which are equal to 0.
2677 input_dims->erase(std::remove(input_dims->begin(), input_dims->end(), 0),
2678 input_dims->end());
2679 // Reshape tensor.
2680 nvinfer1::Dims new_dims;
2681 TF_RETURN_IF_ERROR(ContainerToTrtDims(*input_dims, &new_dims));
2682 VLOG(2) << "input_dims: " << DebugString(new_dims);
2683 TF_RETURN_IF_ERROR(PrepareTensorForShape(
2684 params->converter, TRT_TensorOrWeights(input), new_dims,
2685 /*validation_only=*/false, output, params->node_def));
2686 return Status::OK();
2687 }
2688
2689 Status ConvertSqueeze(OpConverterParams* params) {
2690 const auto& inputs = params->inputs;
2691 const auto& node_def = params->node_def;
2692 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
2693 TF_RETURN_IF_ERROR(AllowDataTypes(
2694 *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
2695 // Get input shape.
2696 const TRT_TensorOrWeights& input_tensor = inputs.at(0);
2697 const nvinfer1::Dims dims = input_tensor.GetTrtDims();
2698 std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
2699 TFAttrs attrs(node_def);
2700 auto squeeze_dims = attrs.get<std::vector<int64>>("squeeze_dims");
2701 if (squeeze_dims.empty()) {
2702 if (params->use_implicit_batch || !HasStaticShape(dims)) {
2703 return errors::Unimplemented(
2704 "Squeeze is not implemented for empty squeeze_dims, at ",
2705 node_def.name());
2706 } else {
2707 // In explicit batch mode with a static input shape we squeeze all
2708 // singleton dimensions.
2709 for (int& dim : input_dims) {
2710 if (dim == 1) {
2711 // Mark it for removal by setting it to 0
2712 dim = 0;
2713 }
2714 }
2715 }
2716 } else {
2717 std::vector<int> trt_axes;
2718 trt_axes.reserve(squeeze_dims.size());
2719 for (int tf_axis : squeeze_dims) {
2720 // If the axis is valid, then convert it to TRT axis, otherwise abort
2721 // conversion.
2722 int trt_axis;
2723 TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(),
2724 params->use_implicit_batch, &trt_axis));
2725 // Make sure target dimension is size 1 or unknown size (-1)
2726 if (input_dims[trt_axis] != -1 && input_dims[trt_axis] != 1) {
2727 return errors::InvalidArgument(
2728 "Dimension ", tf_axis, " with size ", input_dims[trt_axis],
2729 " cannot be squeezed because it must be size 1, at ",
2730 node_def.name());
2731 }
2732 trt_axes.push_back(trt_axis);
2733 }
2734 // Mark axes to remove by setting them to 0.
2735 for (int axis : trt_axes) {
2736 input_dims[axis] = 0;
2737 }
2738 }
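  // Illustrative example (explicit batch): an input of shape [2, 1, 3] with
  // squeeze_dims = {1} (or {-2}) marks input_dims as {2, 0, 3}, and
  // SqueezeTensor below produces an output of shape [2, 3].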
2739 if (params->validation_only) return Status::OK();
2740
2741 ITensorProxyPtr output_tensor = nullptr;
2742 TF_RETURN_IF_ERROR(params->converter->SqueezeTensor(
2743 input_tensor.tensor(), &input_dims, params, &output_tensor));
2744 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
2745 return Status::OK();
2746 }
2747
2748 template <typename Container>
2749 Status ConvertStridedSliceHelper(
2750 OpConverterParams* params, const TRT_TensorOrWeights& input,
2751 Container begin, Container size, const Container& stride,
2752 const nvinfer1::Dims* final_shape = nullptr,
2753 absl::optional<int> op_instance = absl::nullopt) {
2754 if (!params->use_implicit_batch &&
2755 (!HasStaticShape(begin) || !HasStaticShape(size))) {
2756 return errors::Unimplemented(
2757 "Strided slice op not implemented for dynamic shape input");
2758 }
2759 const auto& node_def = params->node_def;
2760 // Get input dims.
2761 nvinfer1::Dims dims = input.GetTrtDims();
2762 std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
2763 if (params->use_implicit_batch) {
2764 // Begin, size and stride do include the explicit batch dim. Add the batch
2765 // dimension to input_dims so that the indexes line up properly.
2766 input_dims.insert(input_dims.begin(), -1);
2767 }
2768 // Check bounds.
2769 for (int i = 1; i < input_dims.size(); i++) {
2770 if (input_dims[i] < 0 || size[i] < 0) continue;
2771 if (begin[i] < 0 || begin[i] > input_dims[i]) {
2772 return errors::InvalidArgument("\"begin\" for dimension ",
2773 std::to_string(i), " in ", node_def.op(),
2774 " is out of range, at ", node_def.name());
2775 }
2776 int end = begin[i];
2777 if (size[i] > 0) end += (size[i] - 1) * stride[i];
2778 if (end < 0 || end > input_dims[i]) {
2779 return errors::InvalidArgument("\"begin\" + \"size\" for dimension ",
2780 std::to_string(i), " in ", node_def.op(),
2781 " is out of range, at ", node_def.name());
2782 }
2783 }
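  // For example (illustrative): begin = 1, size = 3, stride = 2 reads
  // indices 1, 3 and 5, so the last index accessed is
  // begin + (size - 1) * stride = 5 and must stay within the dimension.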
2784
2785 nvinfer1::Dims begin_dims, size_dims, stride_dims;
2786 TF_RETURN_IF_ERROR(
2787 ContainerToTrtDims(begin, &begin_dims,
2788 /*ignore_first_dim=*/params->use_implicit_batch));
2789 TF_RETURN_IF_ERROR(
2790 ContainerToTrtDims(size, &size_dims,
2791 /*ignore_first_dim=*/params->use_implicit_batch));
2792 TF_RETURN_IF_ERROR(
2793 ContainerToTrtDims(stride, &stride_dims, params->use_implicit_batch));
2794 if (params->validation_only) return Status::OK();
2795
2796 VLOG(2) << "Adding slice layer with begin=" << DebugString(begin_dims)
2797 << ", size=" << DebugString(size_dims)
2798 << ", stride=" << DebugString(stride_dims);
2799 nvinfer1::ISliceLayer* layer = params->converter->network()->addSlice(
2800 *input.tensor()->trt_tensor(), begin_dims, size_dims, stride_dims);
2801 params->converter->SetLayerName(layer, params->node_def, "slice",
2802 op_instance);
2803 ITensorProxyPtr tensor = layer->getOutput(0);
2804 // Reshape for shrink_axis.
2805 if (final_shape) {
2806 TF_RETURN_IF_ERROR(PrepareTensorForShape(
2807 params->converter, TRT_TensorOrWeights(tensor), *final_shape,
2808 /*validation_only=*/false, &tensor, node_def, op_instance));
2809 }
2810 params->outputs->push_back(TRT_TensorOrWeights(tensor));
2811 return Status::OK();
2812 }
2813
2814 Status ConvertSlice(OpConverterParams* params) {
2815 const auto& inputs = params->inputs;
2816 const auto& node_def = params->node_def;
2817 TF_RETURN_IF_ERROR(CheckInputsWeights(
2818 *params, {{"input", false}, {"begin", true}, {"size", true}}));
2819 TF_RETURN_IF_ERROR(AllowDataTypes(
2820 *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
2821 std::vector<int> begin = inputs.at(1).weights().ToVector<int>();
2822 std::vector<int> size = inputs.at(2).weights().ToVector<int>();
2823 // Get input dims.
2824 nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
2825 std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
2826 // Add batch dimension so that indexes line up properly.
2827 if (params->use_implicit_batch) {
2828 input_dims.insert(input_dims.begin(), inputs.at(0).batch_size());
2829 }
2830 if (!AllLengthsEqual({input_dims, begin, size})) {
2831 return errors::InvalidArgument(
2832 "Length of begin and size arguments must equal rank of input for "
2833 "Slice, at ",
2834 node_def.name());
2835 }
2836 // Check that batch dimension is unmodified.
2837 if (params->use_implicit_batch) {
2838 const bool begin_is_modified = begin[0] != 0;
2839 // If size[0] is not -1, we can only know if the batch dimension is
2840 // unmodified when the batch size is defined. When the batch size is
2841 // undefined, we don't convert to be safe.
2842 const bool size_is_unchanged = size[0] == -1 || size[0] == input_dims[0];
2843 if (begin_is_modified || !size_is_unchanged) {
2844 return errors::Unimplemented(
2845 "TensorRT does not allow modifications to the batch dimension, at ",
2846 node_def.name());
2847 }
2848 }
2849 // Size of -1 signifies to take all remaining elements.
2850 for (int i = 0; i < input_dims.size(); i++) {
2851 if (size[i] == -1) {
2852 if (input_dims[i] == -1) {
2853 return errors::Unimplemented(
2854 "Input dims must be defined for size = -1, at ", node_def.name());
2855 }
2856 size[i] = input_dims[i] - begin[i];
2857 } else if (size[i] < -1) {
2858 return errors::InvalidArgument("Invalid size value at ", node_def.name());
2859 }
2860 if (input_dims[i] != -1 && (begin[i] < 0 || begin[i] > input_dims[i])) {
2861 return errors::InvalidArgument("\"begin\" for dimension ",
2862 std::to_string(i), " in ", node_def.op(),
2863 " is out of range, at ", node_def.name());
2864 }
2865 const int end = begin[i] + size[i];
2866 if (input_dims[i] != -1 && (end < 0 || end > input_dims[i])) {
2867 return errors::InvalidArgument("\"begin\" + \"size\" for dimension ",
2868 std::to_string(i), " in ", node_def.op(),
2869 " is out of range, at ", node_def.name());
2870 }
2871 }
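  // Illustrative example: for an input of shape [4, 6] with begin = [1, 2]
  // and size = [-1, 3], size becomes [3, 3], selecting rows 1..3 and
  // columns 2..4.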
2872 // Stride is 1 for all dims.
2873 std::vector<int> stride(begin.size(), 1);
2874 return ConvertStridedSliceHelper(params, inputs.at(0), begin, size, stride);
2875 }
2876
2877 Status ConvertStridedSlice(OpConverterParams* params) {
2878 const auto& inputs = params->inputs;
2879 const auto& node_def = params->node_def;
2880 // The TF op allows negative begin/end indices while TRT requires values
2881 // within bounds. This is because we use the default slice mode
2882 // (see ISliceLayer::setMode) with TRT: "Fail with error when the coordinates
2883 // are out of bounds". If the begin/end tensors have negative values then we
2884 // map them to positive values. The way this is currently implemented requires
2885 // that begin/end are constants, therefore we allow only weights for begin/end.
2886 //
2887 // The output size is determined by begin, end and strides. For shape tensors
2888 // TRT requires that the output size is known at engine construction time. To
2889 // reduce the complexity of the converter, we also require a constant size for
2890 // non-shape input. This implies that the stride input also has to be a
2891 // constant (weights).
2892 TF_RETURN_IF_ERROR(CheckInputsWeights(
2893 *params,
2894 {{"input", false}, {"begin", true}, {"end", true}, {"strides", true}}));
2895 TF_RETURN_IF_ERROR(AllowDataTypes(
2896 *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
2897
2898 // TODO(tfeher): Enable dynamic shape input.
2899 if (!HasStaticShape(inputs.at(0).GetTrtDims())) {
2900 return errors::Unimplemented(
2901 "Strided slice op not implemented for dynamic shape input");
2902 }
2903 TFAttrs attrs(node_def);
2904 // new_axis_mask is not supported. TODO(tfeher): Support this with ExpandDims.
2905 const int32 new_axis_mask = attrs.get<int64>("new_axis_mask");
2906 if (new_axis_mask != 0) {
2907 return errors::Unimplemented(
2908 "new_axis_mask is not supported for StridedSlice, at ",
2909 node_def.name());
2910 }
2911 const int32 begin_mask = attrs.get<int64>("begin_mask");
2912 const int32 end_mask = attrs.get<int64>("end_mask");
2913 const int32 ellipsis_mask = attrs.get<int64>("ellipsis_mask");
2914 const int32 shrink_axis_mask = attrs.get<int64>("shrink_axis_mask");
2915
2916 // Get input dims.
2917 nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
2918 std::vector<int64> input_dims(dims.d, dims.d + dims.nbDims);
2919 // Add batch dimension so that indexes line up properly. Set it to -1 if it's
2920 // unknown, so ValidateStridedSliceOp() can handle it correctly below.
2921 if (params->use_implicit_batch) {
2922 input_dims.insert(input_dims.begin(),
2923 std::max(-1, inputs.at(0).batch_size()));
2924 }
2925
2926 const TRT_ShapedWeights& begin_weights = inputs.at(1).weights();
2927 const TRT_ShapedWeights& end_weights = inputs.at(2).weights();
2928 const TRT_ShapedWeights& stride_weights = inputs.at(3).weights();
2929 if (!AllLengthsEqual({begin_weights.ToVector<int>(),
2930 end_weights.ToVector<int>(),
2931 stride_weights.ToVector<int>()})) {
2932 return errors::InvalidArgument(
2933 "Length of begin, end, and stride must be equal, at ", node_def.name());
2934 }
2935
2936 // The slice op has many ways to define the actual operation that needs to be
2937 // performed. We use ValidateStridedSliceOp to map the input parameters to
2938 // begin, end, & strides. ValidateStridedSliceOp makes an effort to set known
2939 // (static) begin/end/strides parameters. On return, begin, end, stride,
2940 // processing_shape has the same rank as input. final_shape has extra dims
2941 // added/removed. Negative values in begin/end/stride are converted to
2942 // positive values to produce a known processing_shape if the input shape is
2943 // static. Otherwise, processing_shape and final_shape may contain unknown
2944 // dimension values.
2945 PartialTensorShape input_shape(input_dims);
2946 PartialTensorShape processing_shape;
2947 PartialTensorShape final_shape;
2948 bool is_identity;
2949 bool is_simple_slice;
2950 bool slice_dim0;
2951 absl::InlinedVector<int64, 4> begin;
2952 absl::InlinedVector<int64, 4> end;
2953 absl::InlinedVector<int64, 4> strides;
2954 TF_RETURN_IF_ERROR(ValidateStridedSliceOp(
2955 &begin_weights.GetTensor(), &end_weights.GetTensor(),
2956 stride_weights.GetTensor(), input_shape, begin_mask, end_mask,
2957 ellipsis_mask, new_axis_mask, shrink_axis_mask, &processing_shape,
2958 &final_shape, &is_identity, &is_simple_slice, &slice_dim0, &begin, &end,
2959 &strides));
2960
2961 // If the batch dimension is covered by the ellipsis mask, it is left
2962 // untouched. Otherwise we check here whether the slice modifies it.
2963 if (params->use_implicit_batch &&
2964 (!(ellipsis_mask & 1) ||
2965 begin_weights.shape_.nbDims >= input_dims.size())) {
2966 // Check that batch dimension is unmodified. We need to use the expanded
2967 // begin/end/strides array since the original array may be incorrect when
2968 // (ellipsis_mask&1)==1.
2969 const bool begin_is_modified = !(begin_mask & 1) && (begin[0] != 0);
2970 const bool stride_is_modified = (strides[0] != 1);
2971 // If the batch size is -1 and the end mask is not set, we can only know if
2972 // the batch dimension is unmodified when the batch size is defined. When
2973 // the batch size is undefined, we don't convert to be safe.
2974 const bool batch_size_is_defined = (input_dims[0] > 0);
2975 const bool end_is_modified =
2976 !(end_mask & 1) && (!batch_size_is_defined ||
2977 (batch_size_is_defined && end[0] != input_dims[0]));
2978 if (begin_is_modified || stride_is_modified || end_is_modified) {
2979 return errors::Unimplemented(
2980 "TensorRT does not allow modifications to the batch dimension, at ",
2981 node_def.name());
2982 }
2983 }
2984 // Can't shrink axis on batch dimension.
2985 if (params->use_implicit_batch && shrink_axis_mask & 1) {
2986 return errors::Unimplemented(
2987 "TensorRT does not allow modifications to the batch dimension, at ",
2988 node_def.name());
2989 }
2990
2991 // TRT Slice layer uses (begin, size) instead of (begin, end). We calculate
2992 // the size if possible, otherwise we set it to -1.
2993 absl::InlinedVector<int64, 4> size(input_dims.size());
2994 for (int i = 0; i < input_dims.size(); i++) {
2995 if (input_dims[i] < 0) {
2996 // Often begin[i] and end[i] could be used to calculate the size.
2997 // (Although the presence of begin/end masks makes it non-trivial, because
2998 // a 0 value might indicate that a mask was used.) But the size has to be
2999 // clamped to match the array size, for which we need to use the dynamic
3000 // version of the helper routines. Therefore we set size to -1,
3001 // which will select the dynamic shape helper (to be implemented).
3002 size[i] = -1;
3003 continue;
3004 }
3005 // Divide by stride (round up).
3006 size[i] = strides[i] > 0
3007 ? (end[i] - begin[i] + strides[i] - 1) / strides[i]
3008 : (begin[i] - end[i] + abs(strides[i]) - 1) / abs(strides[i]);
3009 if (size[i] < 0) {
3010 return errors::InvalidArgument(
3011 "\"size\" cannot be negative for StridedSlice");
3012 }
3013 }
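  // Worked example (illustrative): begin = 1, end = 8, stride = 3 gives
  // size = (8 - 1 + 3 - 1) / 3 = 3, i.e. the slice reads indices 1, 4 and 7.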
3014
3015 // shrink_axis_mask requires a reshape after the slice.
3016 nvinfer1::Dims final_shape_dims;
3017 nvinfer1::Dims* final_shape_dims_ptr = nullptr;
3018 if (shrink_axis_mask) {
3019 TF_RETURN_IF_ERROR(TensorShapeToTrtDims(
3020 final_shape, /*ignore_first_dim=*/params->use_implicit_batch,
3021 &final_shape_dims));
3022 final_shape_dims_ptr = &final_shape_dims;
3023 }
3024
3025 return ConvertStridedSliceHelper(params, inputs.at(0), begin, size, strides,
3026 final_shape_dims_ptr, 0);
3027 }
3028
3029 Status ConvertConv2D(OpConverterParams* params) {
3030 return ConvertConv2DHelper(params, 1, /*is_conv2d_backprop_input=*/false);
3031 }
3032
3033 Status ConvertConv2DDepthwise(OpConverterParams* params) {
3034 return ConvertConv2DHelper(params, 0, /*is_conv2d_backprop_input=*/false);
3035 }
3036
3037 Status ConvertConv2DBackpropInput(OpConverterParams* params) {
3038 return ConvertConv2DHelper(params, 1, /*is_conv2d_backprop_input=*/true);
3039 }
3040
3041 Status ConvertConv3DHelper(OpConverterParams* params, int group,
3042 bool is_conv3d_backprop_input = false) {
3043 const int kNumDims = 5;
3044 const auto& inputs = params->inputs;
3045 const auto& node_def = params->node_def;
3046 TRT_TensorOrWeights backprop_output_size;
3047 ITensorProxyPtr tensor = nullptr;
3048 if (is_conv3d_backprop_input) {
3049 // In the case when Conv3dBackpropInput is used for conv3d_transpose, these
3050 // inputs correspond to: output size, filter, and input.
3051 TF_RETURN_IF_ERROR(CheckInputsWeights(
3052 *params,
3053 {{"input_sizes", true}, {"filter", true}, {"out_backprop", false}}));
3054 backprop_output_size = inputs.at(0);
3055 tensor = inputs.at(2).tensor();
3056 } else {
3057 TF_RETURN_IF_ERROR(
3058 CheckInputsWeights(*params, {{"input", false}, {"filter", true}}));
3059 tensor = inputs.at(0).tensor();
3060 }
3061 TF_RETURN_IF_ERROR(
3062 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3063 const TRT_ShapedWeights weights_drsck = inputs.at(1).weights();
3064 if (weights_drsck.shape_.nbDims != kNumDims) {
3065 return errors::InvalidArgument("Conv3D expects kernel of dimension 5, at ",
3066 node_def.name());
3067 }
3068 TFAttrs attrs(node_def);
3069 auto data_format = attrs.get<string>("data_format");
3070 const bool is_ndhwc = (data_format == "NDHWC"); // Or NCDHW; 01234 -> 02341
3071 const int d_index = is_ndhwc ? 1 : 2;
3072 const int h_index = is_ndhwc ? 2 : 3;
3073 const int w_index = is_ndhwc ? 3 : 4;
3074 const int c_index = is_ndhwc ? 4 : 1;
3075 auto tf_dilations = attrs.get<std::vector<int64>>("dilations");
3076 if (tf_dilations.size() != kNumDims) {
3077 return errors::InvalidArgument(
3078 "Convolution dilations field must specify 5 dimensions, at ",
3079 node_def.name());
3080 }
3081 if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) {
3082 return errors::Unimplemented(
3083 "Dilation rate must be 1 for batch and channel dimensions, at ",
3084 node_def.name());
3085 }
3086
3087 const nvinfer1::Dims3 dilation_dhw(
3088 tf_dilations[d_index], tf_dilations[h_index], tf_dilations[w_index]);
3089 if (is_conv3d_backprop_input &&
3090 (dilation_dhw.d[0] != 1 || dilation_dhw.d[1] != 1 ||
3091 dilation_dhw.d[2] != 1)) {
3092 return errors::Unimplemented(
3093 "Dilation with Conv3DBackpropInputV2 (conv3d_transpose) is not "
3094 "supported",
3095 ", at ", node_def.name());
3096 }
3097
3098 const auto tf_stride = attrs.get<std::vector<int64>>("strides");
3099 if (tf_stride.size() != kNumDims) {
3100 return errors::InvalidArgument(
3101 "Convolution strides field must specify 5 dimensions, at ",
3102 node_def.name());
3103 }
3104 if (tf_stride[0] != 1 || tf_stride[c_index] != 1) {
3105 return errors::Unimplemented(
3106 "Stride must be 1 for batch and channel dimensions, at ",
3107 node_def.name());
3108 }
3109
3110 const nvinfer1::Dims3 stride_dhw(tf_stride[d_index], tf_stride[h_index],
3111 tf_stride[w_index]);
3112 const auto tensor_dim = tensor->getDimensions();
3113
3114 // Asymmetric padding on Deconv not supported for now
3115 if (is_conv3d_backprop_input && attrs.get<string>("padding") == "SAME") {
3116 TRT_ShapedWeights weights =
3117 params->weight_store->GetTempWeights(weights_drsck);
3118
3119 nvinfer1::Dims3 effective_kernel_size(
3120 weights.shape_.d[0] +
3121 (weights.shape_.d[0] - 1) * (dilation_dhw.d[0] - 1), // D
3122 weights.shape_.d[1] +
3123 (weights.shape_.d[1] - 1) * (dilation_dhw.d[1] - 1), // R
3124 weights.shape_.d[2] +
3125 (weights.shape_.d[2] - 1) * (dilation_dhw.d[2] - 1) // S
3126 );
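    // The effective kernel extent is k + (k - 1) * (dilation - 1); e.g. a
    // 3-tap kernel with dilation 2 spans 5 input elements. (For this backprop
    // path the dilation was already checked to be 1 above, so the effective
    // size equals the kernel size.)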
3127
3128 const auto output_size_weights =
3129 static_cast<int*>(backprop_output_size.weights().GetValues());
3130 const std::vector<int64_t> input_dims = {output_size_weights[d_index],
3131 output_size_weights[h_index],
3132 output_size_weights[w_index]};
3133
3134 const std::vector<std::pair<int, int>> padding =
3135 CreateSamePadding(stride_dhw, effective_kernel_size, input_dims);
3136
3137 if (padding[0].first != padding[0].second ||
3138 padding[1].first != padding[1].second ||
3139 padding[2].first != padding[2].second) {
3140 return errors::Unimplemented(
3141 "Asymmetric padding with Conv3DBackpropInputV2 (conv3d_transpose) is "
3142 "not supported, at ",
3143 node_def.name());
3144 }
3145 }
3146
3147 // Channel dim must be static for Conv3D since we use that value for
3148 // num_groups at build time.
3149 // TODO: Allow conversion if kImplicitBatchModeCompatible||kOptimal is used.
3150 int implicit_batch_offset = params->use_implicit_batch ? -1 : 0;
3151 if (tensor->getDimensions().d[c_index + implicit_batch_offset] == -1) {
3152 return errors::InvalidArgument("Channel dimension must be static, at ",
3153 node_def.name());
3154 }
3155
3156 // Finished validation checks
3157 if (params->validation_only) return Status::OK();
3158
3159 // Transpose to NCDHW (NCDHW is required for IConvLayer).
3160 const bool need_transpose = is_ndhwc;
3161 if (need_transpose) {
3162 TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
3163 tensor, {0, 4, 1, 2, 3}, &tensor, node_def, "to_NCDHW"));
3164 }
3165
3166 // group == 0 signifies that this is a depthwise convolution, so set
3167 // num_groups to the size of the input's channel dim. For a non-depthwise
3168 // conv, num_groups will be 1.
3168 // num_groups will be 1.
3169 const int num_groups = (group == 0) ? tensor_dim.d[0] : group;
3170
3171 // For conv, TF weights are DRSCK, and TRT expects KCDRS.
3172 // For backprop, TF weights are DRSKC, and TRT expects KCDRS.
3173 // Therefore, this reorder will work for both cases.
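  // Illustrative example (assumed shapes, num_groups = 1): TF Conv3D weights
  // of shape [D, R, S, C, K] = [3, 3, 3, 8, 16] are reordered to TRT weights
  // of shape [K, C, D, R, S] = [16, 8, 3, 3, 3].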
3174 TRT_ShapedWeights weights =
3175 params->weight_store->GetTempWeights(weights_drsck);
3176 ReorderDRSCKToKCDRS(weights_drsck, &weights, num_groups);
3177 TRT_ShapedWeights biases(weights.TrtDType());
3178 const int output_axis = is_conv3d_backprop_input ? 1 : 0;
3179 const int noutput = weights.shape_.d[output_axis] * num_groups;
3180 nvinfer1::Dims3 kernel_size_drs(weights.shape_.d[2], // D
3181 weights.shape_.d[3], // R
3182 weights.shape_.d[4] // S
3183 );
3184
3185 // Add convolution.
3186 nvinfer1::ILayer* conv_layer = nullptr;
3187 if (is_conv3d_backprop_input) {
3188 nvinfer1::IDeconvolutionLayer* layer =
3189 params->converter->network()->addDeconvolutionNd(
3190 *tensor->trt_tensor(), noutput, kernel_size_drs,
3191 weights.GetTrtWeights(), biases.GetTrtWeights());
3192 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3193 layer->setStrideNd(stride_dhw); // Set the 3-D (DHW) stride.
3194
3195 if (attrs.get<string>("padding") == "SAME") {
3196 VLOG(2) << "Using SAME padding";
3197 // SAME_UPPER means that post padding is preferred.
3198 layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
3199 }
3200
3201 layer->setNbGroups(num_groups);
3202 conv_layer = layer;
3203 } else {
3204 nvinfer1::IConvolutionLayer* layer =
3205 params->converter->network()->addConvolutionNd(
3206 *tensor->trt_tensor(), noutput, kernel_size_drs,
3207 weights.GetTrtWeights(), biases.GetTrtWeights());
3208 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3209 layer->setStrideNd(stride_dhw);
3210
3211 if (attrs.get<string>("padding") == "SAME") {
3212 VLOG(2) << "Using SAME padding";
3213 layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
3214 }
3215
3216 layer->setNbGroups(num_groups);
3217 layer->setDilationNd(dilation_dhw);
3218 conv_layer = layer;
3219 }
3220 params->converter->SetLayerName(conv_layer, node_def, "conv");
3221 ITensorProxyPtr output_tensor = conv_layer->getOutput(0);
3222
3223 // Restore transpose.
3224 if (need_transpose) {
3225 TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
3226 output_tensor, {0, 2, 3, 4, 1}, &output_tensor, node_def, "to_NDHWC"));
3227 }
3228 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
3229 return Status::OK();
3230 }
3231
3232 Status ConvertConv3D(OpConverterParams* params) {
3233 return ConvertConv3DHelper(params, 1, /*is_conv3d_backprop_input=*/false);
3234 }
3235
3236 Status ConvertConv3DBackpropInputV2(OpConverterParams* params) {
3237 return ConvertConv3DHelper(params, 1, /*is_conv3d_backprop_input=*/true);
3238 }
3239
3240 Status ConvertPool3D(OpConverterParams* params) {
3241 const int kNumDims = 5;
3242 const auto& inputs = params->inputs;
3243 const auto& node_def = params->node_def;
3244 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
3245 TF_RETURN_IF_ERROR(
3246 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3247 nvinfer1::PoolingType type;
3248 if (node_def.op() == "MaxPool3D") {
3249 type = nvinfer1::PoolingType::kMAX;
3250 } else if (node_def.op() == "AvgPool3D") {
3251 type = nvinfer1::PoolingType::kAVERAGE;
3252 } else {
3253 return errors::Unimplemented("Unsupported pooling type: ", node_def.op(),
3254 ", at ", node_def.name());
3255 }
3256 TFAttrs attrs(node_def);
3257 const string padding_type = attrs.get<string>("padding");
3258 if ((padding_type != "SAME") && (padding_type != "VALID")) {
3259 return errors::Unimplemented("Unsupported padding type: ", padding_type,
3260 ", at ", node_def.name());
3261 }
3262 const auto data_format = attrs.get<string>("data_format");
3263 const bool is_ndhwc = (data_format == "NDHWC");
3264 const int c_index = is_ndhwc ? 4 : 1;
3265 const int d_index = is_ndhwc ? 1 : 2;
3266 const int h_index = is_ndhwc ? 2 : 3;
3267 const int w_index = is_ndhwc ? 3 : 4;
3268 const auto tf_stride = attrs.get<std::vector<int64>>("strides");
3269 if (tf_stride.size() != kNumDims) {
3270 return errors::InvalidArgument(
3271 "Pooling strides field must specify 5 dimensions, at ",
3272 node_def.name());
3273 }
3274 if (tf_stride[0] != 1 || tf_stride[c_index] != 1) {
3275 return errors::Unimplemented(
3276 "stride must be 1 for batch and channel dimensions, at ",
3277 node_def.name());
3278 }
3279 const auto tf_kernel = attrs.get<std::vector<int64>>("ksize");
3280 if (tf_kernel.size() != kNumDims) {
3281 return errors::InvalidArgument(
3282 "Pooling ksize field must specify 5 dimensions, at ", node_def.name());
3283 }
3284 if (tf_kernel[0] != 1 || tf_kernel[c_index] != 1) {
3285 return errors::Unimplemented(
3286 "ksize must be 1 for batch and channel dimensions, at ",
3287 node_def.name());
3288 }
3289 if (params->validation_only) return Status::OK();
3290
3291 ITensorProxyPtr tensor = inputs.at(0).tensor();
3292 if (data_format == "NDHWC") {
3293 // NDHWC => NCDHW
3294 TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
3295 tensor, {0, 4, 1, 2, 3}, &tensor, node_def, "to_NCDHW"));
3296 }
3297
3298 const nvinfer1::Dims3 stride(tf_stride[d_index], tf_stride[h_index],
3299 tf_stride[w_index]);
3300 const nvinfer1::Dims3 ksize(tf_kernel[d_index], tf_kernel[h_index],
3301 tf_kernel[w_index]);
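  // Illustrative example: MaxPool3D with NDHWC ksize = [1, 2, 2, 2, 1] and
  // strides = [1, 2, 2, 2, 1] becomes a TRT pooling window of 2x2x2 with a
  // stride of 2 in each of D, H and W.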
3302
3303 nvinfer1::IPoolingLayer* layer = params->converter->network()->addPoolingNd(
3304 *tensor->trt_tensor(), type, ksize);
3305 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3306
3307 layer->setStrideNd(stride);
3308 // VALID padding is the default TRT behavior.
3309 if (padding_type == "SAME") {
3310 // SAME_UPPER means that post padding is preferred.
3311 layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
3312 }
3313 params->converter->SetLayerName(layer, node_def, "pooling");
3314
3315 ITensorProxyPtr output_tensor = layer->getOutput(0);
3316 if (data_format == "NDHWC") {
3317 // NCDHW => NDHWC
3318 TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
3319 output_tensor, {0, 2, 3, 4, 1}, &output_tensor, node_def, "to_NDHWC"));
3320 }
3321
3322 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
3323 return Status::OK();
3324 }
3325
3326 Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) {
3327 const auto& inputs = params->inputs;
3328 const auto& node_def = params->node_def;
3329
3330 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false},
3331 {"filter", true},
3332 {"bias", true},
3333 {"side_input", true},
3334 {"conv_input_scale", true},
3335 {"side_input_scale", true}}));
3336 ITensorProxyPtr tensor = inputs.at(0).tensor();
3337 TF_RETURN_IF_ERROR(
3338 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3339 TRT_ShapedWeights weights = inputs.at(1).weights();
3340 if (weights.shape_.nbDims != 4) {
3341 return errors::InvalidArgument(
3342 "FusedConv2DBiasActivation expects kernel of dimension 4, at " +
3343 node_def.name());
3344 }
3345 TFAttrs attrs(node_def);
3346 auto data_format = attrs.get<string>("data_format");
3347 if (data_format != "NHWC" && data_format != "NCHW") {
3348 return errors::InvalidArgument("Unsupported data_format:", data_format,
3349 " at ", node_def.name());
3350 }
3351
3352 int c_index = (data_format == "NHWC") ? 3 : 1;
3353 int h_index = (data_format == "NHWC") ? 1 : 2;
3354 int w_index = (data_format == "NHWC") ? 2 : 3;
3355 auto tf_dilations = attrs.get<std::vector<int64>>("dilations");
3356 if (tf_dilations.size() != 4) {
3357 return errors::InvalidArgument(
3358 "Convolution dilations field must specify 4 dimensions, at ",
3359 node_def.name());
3360 }
3361 if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) {
3362 return errors::Unimplemented(
3363 "Dilation rate must be 1 for batch and channel dimensions, at ",
3364 node_def.name());
3365 }
3366 const nvinfer1::DimsHW dilation(tf_dilations[h_index], tf_dilations[w_index]);
3367
3368 const auto tf_stride = attrs.get<std::vector<int64>>("strides");
3369 if (tf_stride.size() != 4) {
3370 return errors::InvalidArgument(
3371 "Convolution strides field must specify 4 dimensions, at ",
3372 node_def.name());
3373 }
3374 if (tf_stride[0] != 1 || tf_stride[c_index] != 1) {
3375 return errors::Unimplemented(
3376 "Stride must be 1 for batch and channel dimensions, at ",
3377 node_def.name());
3378 }
3379 const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
3380 const auto activation_mode = attrs.get<string>("activation_mode");
3381 auto op_pair = ActivationTypeMap()->find(activation_mode);
3382 if (op_pair == ActivationTypeMap()->end() && activation_mode != "None") {
3383 return errors::Unimplemented("Activation mode: ", activation_mode,
3384 " not supported at: ", node_def.name());
3385 }
3386
3387 const auto filter_format = attrs.get<string>("filter_format");
3388 if (filter_format != "HWIO" && filter_format != "OIHW") {
3389 return errors::InvalidArgument("Unsupported filter_format:", filter_format,
3390 " at ", node_def.name());
3391 }
3392 // Check that there is no side_input and that conv_input_scale is 1.
3393 TRT_ShapedWeights side_input = inputs.at(3).weights();
3394 if (side_input.count() != 0) {
3395 return errors::InvalidArgument(
3396 "FusedConv2DBiasActivation doesn't yet support side_input, at " +
3397 node_def.name());
3398 }
3399 TRT_ShapedWeights conv_input_scale = inputs.at(4).weights();
3400 if (conv_input_scale.count() != 1 ||
3401 conv_input_scale.TrtDType() != nvinfer1::DataType::kFLOAT ||
3402 conv_input_scale.GetSpan<float>()[0] != 1.0) {
3403 return errors::InvalidArgument(
3404 "FusedConv2DBiasActivation doesn't yet support conv_input_scale, at " +
3405 node_def.name());
3406 }
3407 if (params->validation_only) return Status::OK();
3408
3409 // Transpose to NCHW (NCHW is required for IConvLayer).
3410 const bool need_transpose = (data_format == "NHWC");
3411 if (need_transpose) {
3412 TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
3413 tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW"));
3414 }
3415
3416 nvinfer1::DimsHW kernel_size;
3417 if (filter_format == "OIHW") {
3418 kernel_size.h() = weights.shape_.d[2];
3419 kernel_size.w() = weights.shape_.d[3];
3420 } else {
3421 // HWIO.
3422 DCHECK_EQ(filter_format, "HWIO");
3423 kernel_size.h() = weights.shape_.d[0];
3424 kernel_size.w() = weights.shape_.d[1];
3425 }
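  // For illustration: HWIO weights of shape [3, 3, 64, 128] describe a 3x3
  // kernel with 64 input and 128 output channels; the equivalent OIHW layout
  // is [128, 64, 3, 3].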
3426
3427 // Add convolution.
3428 TRT_ShapedWeights biases = inputs.at(2).weights();
3429 nvinfer1::IConvolutionLayer* conv_layer = nullptr;
3430 if (filter_format == "OIHW") {
3431 // Weights are already in the right order.
3432 conv_layer = params->converter->network()->addConvolution(
3433 *tensor->trt_tensor(), weights.shape_.d[0], kernel_size,
3434 weights.GetTrtWeights(), biases.GetTrtWeights());
3435 } else {
3436 // For conv, TF weights are RSCK, and TRT expects KCRS.
3437 DCHECK_EQ(filter_format, "HWIO");
3438 TRT_ShapedWeights weights_kcrs =
3439 params->weight_store->GetTempWeights(weights);
3440 ReorderRSCKToKCRS(weights, &weights_kcrs, 1);
3441 conv_layer = params->converter->network()->addConvolution(
3442 *tensor->trt_tensor(), weights.shape_.d[3], kernel_size,
3443 weights_kcrs.GetTrtWeights(), biases.GetTrtWeights());
3444 }
3445 TFTRT_RETURN_ERROR_IF_NULLPTR(conv_layer, node_def.name());
3446 conv_layer->setStride(stride);
3447 if (attrs.get<string>("padding") == "SAME") {
3448 conv_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
3449 }
3450 params->converter->SetLayerName(conv_layer, node_def, "conv");
3451 conv_layer->setNbGroups(1);
3452 conv_layer->setDilation(dilation);
3453 ITensorProxyPtr output_tensor = conv_layer->getOutput(0);
3454
3455 // Add activation if there is one.
3456 if (op_pair != ActivationTypeMap()->end()) {
3457 nvinfer1::IActivationLayer* activation_layer =
3458 params->converter->network()->addActivation(
3459 *output_tensor->trt_tensor(), op_pair->second);
3460 TFTRT_RETURN_ERROR_IF_NULLPTR(activation_layer, node_def.name());
3461 params->converter->SetLayerName(activation_layer, node_def, "activation");
3462 output_tensor = activation_layer->getOutput(0);
3463 }
3464 // Restore transpose.
3465 if (need_transpose) {
3466 TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
3467 output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC"));
3468 }
3469 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
3470 return Status::OK();
3471 }
3472
3473 Status ConvertPool(OpConverterParams* params) {
3474 const auto& inputs = params->inputs;
3475 const auto& node_def = params->node_def;
3476 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
3477 std::set<DataType> allowed_types{DataType::DT_FLOAT, DataType::DT_HALF,
3478 DataType::DT_INT8};
3479 TF_RETURN_IF_ERROR(AllowDataTypes(*params, allowed_types));
3480 nvinfer1::PoolingType type;
3481 if (node_def.op() == "MaxPool") {
3482 type = nvinfer1::PoolingType::kMAX;
3483 } else if (node_def.op() == "AvgPool") {
3484 type = nvinfer1::PoolingType::kAVERAGE;
3485 } else {
3486 return errors::Unimplemented("Unsupported pooling type: ", node_def.op(),
3487 ", at ", node_def.name());
3488 }
3489 TFAttrs attrs(node_def);
3490 const string padding_type = attrs.get<string>("padding");
3491 if ((padding_type != "SAME") && (padding_type != "VALID")) {
3492 return errors::Unimplemented("Unsupported padding type: ", padding_type,
3493 ", at ", node_def.name());
3494 }
3495 if (params->validation_only) return Status::OK();
3496
3497 ITensorProxyPtr tensor = inputs.at(0).tensor();
3498 int h_index = 2;
3499 int w_index = 3;
3500 const auto data_format = attrs.get<string>("data_format");
3501 if (data_format == "NHWC") {
3502 h_index = 1;
3503 w_index = 2;
3504 TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
3505 tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW"));
3506 }
3507
3508 const auto tf_stride = attrs.get<std::vector<int64>>("strides");
3509 const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
3510
3511 const auto tf_kernel = attrs.get<std::vector<int64>>("ksize");
3512 const nvinfer1::DimsHW ksize(tf_kernel[h_index], tf_kernel[w_index]);
3513
3514 nvinfer1::IPoolingLayer* layer = params->converter->network()->addPooling(
3515 *tensor->trt_tensor(), type, ksize);
3516 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3517
3518 layer->setStride(stride);
3519 // VALID padding is the default TRT behavior.
3520 if (attrs.get<string>("padding") == "SAME") {
3521 // SAME_UPPER means that post padding is preferred.
3522 layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
3523 }
3524 params->converter->SetLayerName(layer, node_def, "pooling");
3525 ITensorProxyPtr output_tensor = layer->getOutput(0);
3526
3527 if (data_format == "NHWC") {
3528 TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
3529 output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC"));
3530 }
3531 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
3532 return Status::OK();
3533 }
3534
3535 Status ConvertLeakyRelu(OpConverterParams* params) {
3536 const auto& inputs = params->inputs;
3537 const auto& node_def = params->node_def;
3538 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
3539 TF_RETURN_IF_ERROR(
3540 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3541 TFAttrs attrs(node_def);
3542 const float alpha = attrs.get<float>("alpha");
3543
3544 // Implemented with IActivationLayer's kLEAKY_RELU activation.
3545 if (params->validation_only) return Status::OK();
3546
3547 nvinfer1::IActivationLayer* layer =
3548 params->converter->network()->addActivation(
3549 *inputs.at(0).tensor()->trt_tensor(),
3550 nvinfer1::ActivationType::kLEAKY_RELU);
3551 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3552 params->converter->SetLayerName(layer, node_def, "activation");
3553 layer->setAlpha(alpha);
3554 params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
3555 return Status::OK();
3556 }
3557
3558 Status ConvertClipByValue(OpConverterParams* params) {
3559 const auto& inputs = params->inputs;
3560 const auto& node_def = params->node_def;
3561 // TODO(tmorris): We can also allow the case where min and max are tensors by
3562 // using elementwise min and max layers.
3563 TF_RETURN_IF_ERROR(CheckInputsWeights(
3564 *params,
3565 {{"t", false}, {"clip_value_min", true}, {"clip_value_max", true}}));
3566 TF_RETURN_IF_ERROR(
3567 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3568 if (params->validation_only) return Status::OK();
3569
3570 TFAttrs attrs(node_def);
3571 const DataType dtype = attrs.get<DataType>("T");
3572 float clip_value_min = 0.0f;
3573 float clip_value_max = 0.0f;
3574 // TODO(tmorris): Add a templated helper function to get scalar weights of
3575 // InType casted to OutType.
3576 if (dtype == DataType::DT_FLOAT) {
3577 clip_value_min = inputs.at(1).weights().GetSpan<float>()[0];
3578 clip_value_max = inputs.at(2).weights().GetSpan<float>()[0];
3579 } else if (dtype == DataType::DT_HALF) {
3580 clip_value_min =
3581 static_cast<float>(inputs.at(1).weights().GetSpan<Eigen::half>()[0]);
3582 clip_value_max =
3583 static_cast<float>(inputs.at(2).weights().GetSpan<Eigen::half>()[0]);
3584 }
3585
3586 nvinfer1::IActivationLayer* layer =
3587 params->converter->network()->addActivation(
3588 *inputs.at(0).tensor()->trt_tensor(),
3589 nvinfer1::ActivationType::kCLIP);
3590 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3591 layer->setAlpha(clip_value_min);
3592 layer->setBeta(clip_value_max);
3593 params->converter->SetLayerName(layer, node_def, "activation");
3594 params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
3595 return Status::OK();
3596 }
3597
3598 const std::unordered_map<string, nvinfer1::ActivationType>*
3599 ActivationTypeMap() {
3600 static auto* const m =
3601 new std::unordered_map<string, nvinfer1::ActivationType>({
3602 {"Relu", nvinfer1::ActivationType::kRELU},
3603 {"Sigmoid", nvinfer1::ActivationType::kSIGMOID},
3604 {"Tanh", nvinfer1::ActivationType::kTANH},
3605 {"Elu", nvinfer1::ActivationType::kELU},
3606 {"Selu", nvinfer1::ActivationType::kSELU},
3607 {"Softsign", nvinfer1::ActivationType::kSOFTSIGN},
3608 {"Softplus", nvinfer1::ActivationType::kSOFTPLUS},
3609 });
3610 return m;
3611 }
3612
3613 Status ConvertActivation(OpConverterParams* params) {
3614 const auto& inputs = params->inputs;
3615 const auto& node_def = params->node_def;
3616 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
3617 TF_RETURN_IF_ERROR(
3618 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3619 auto op_pair = ActivationTypeMap()->find(node_def.op());
3620 if (op_pair == ActivationTypeMap()->end()) {
3621 return errors::Unimplemented("Activation op: ", node_def.op(),
3622 " not supported at: ", node_def.name());
3623 }
3624 if (params->validation_only) return Status::OK();
3625
3626 // Start conversion.
3627 nvinfer1::IActivationLayer* layer =
3628 params->converter->network()->addActivation(
3629 *inputs.at(0).tensor()->trt_tensor(), op_pair->second);
3630 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3631 params->converter->SetLayerName(layer, node_def, "activation");
3632 // Set parameters.
3633 if (node_def.op() == "Elu") {
3634 layer->setAlpha(1.0f);
3635 } else if (node_def.op() == "Selu") {
3636 // From tensorflow/core/kernels/relu_op_functor.h
3637 layer->setAlpha(1.7580993408473768599402175208123f);
3638 layer->setBeta(1.0507009873554804934193349852946f);
3639 } else if (node_def.op() == "Softplus") {
3640 layer->setAlpha(1.0f);
3641 layer->setBeta(1.0f);
3642 }
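  // Note (TRT activation semantics, stated here for reference): kELU and
  // kSELU use alpha/beta as scale factors in their standard definitions, and
  // kSOFTPLUS computes alpha * log(exp(beta * x) + 1), so alpha = beta = 1
  // matches tf.nn.softplus.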
3643 params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
3644 return Status::OK();
3645 }
3646
3647 Status ConvertQuantize(OpConverterParams* params) {
3648 const auto& inputs = params->inputs;
3649 const auto& node_def = params->node_def;
3650 if (node_def.op() == "FakeQuantWithMinMaxArgs") {
3651 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
3652 } else if (node_def.op() == "FakeQuantWithMinMaxVars") {
3653 TF_RETURN_IF_ERROR(CheckInputsWeights(
3654 *params, {{"input", false}, {"min", true}, {"max", true}}));
3655 } else if (node_def.op() == "QuantizeAndDequantizeV2") {
3656 TF_RETURN_IF_ERROR(CheckInputsWeights(
3657 *params, {{"input", false}, {"input_min", true}, {"input_max", true}}));
3658 } else if (node_def.op() == "QuantizeAndDequantizeV3") {
3659 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false},
3660 {"input_min", true},
3661 {"input_max", true},
3662 {"num_bits", true}}));
3663 }
3664 float min_range = 0.0f;
3665 float max_range = 0.0f;
3666 if (node_def.op() == "FakeQuantWithMinMaxArgs") {
3667 // Get ranges via node attributes.
3668 TFAttrs attrs(node_def);
3669 if (attrs.count("min") == 0 || attrs.count("max") == 0) {
3670 return errors::InvalidArgument("Min or max attribute not found for ",
3671 node_def.op(), " at ", node_def.name());
3672 }
3673 min_range = attrs.get<float>("min");
3674 max_range = attrs.get<float>("max");
3675 } else if (node_def.op() == "FakeQuantWithMinMaxVars" ||
3676 node_def.op() == "QuantizeAndDequantizeV2" ||
3677 node_def.op() == "QuantizeAndDequantizeV3") {
3678 // Get ranges via inputs.
3679 auto get_weights_value = [&inputs](int index) {
3680 auto raw_weights =
3681 static_cast<float*>(inputs.at(index).weights().GetValues());
3682 return raw_weights[0];
3683 };
3684 min_range = get_weights_value(1);
3685 max_range = get_weights_value(2);
3686 } else {
3687 return errors::InvalidArgument("Unknown quantization op ", node_def.op(),
3688 ", at ", node_def.name());
3689 }
3690 if (params->validation_only) return Status::OK();
3691
3692 // Store ranges for tensor
3693 ITensorProxyPtr input0 = inputs.at(0).tensor();
3694 params->converter->ProvideQuantizationRange(&input0, min_range, max_range);
3695 // Sometimes, TRT may not quantize a tensor, either because it chooses to
3696 // execute a higher precision kernel or because of op fusion. In these cases,
3697 // accuracy will suffer if the model was trained to expect quantization at
3698 // that tensor. We should consider adding a clip(tensor, min_range, max_range)
3699 // operation here to ensure that any arbitrarily placed quantize node will
3700 // execute as expected. However, this will negatively affect performance. If
3701 // users train their models in a way which models inference as close as
3702 // possible (i.e. not quantizing in place where fusion will occur), then there
3703 // is no problem with the current implementation.
3704 params->outputs->push_back(inputs.at(0));
3705 return Status::OK();
3706 }
3707
3708 Status ConvertRelu6(OpConverterParams* params) {
3709 const auto& inputs = params->inputs;
3710 const auto& node_def = params->node_def;
3711 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
3712 TF_RETURN_IF_ERROR(
3713 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3714 if (params->validation_only) return Status::OK();
3715
3716 nvinfer1::IActivationLayer* layer =
3717 params->converter->network()->addActivation(
3718 *inputs.at(0).tensor()->trt_tensor(),
3719 nvinfer1::ActivationType::kCLIP);
3720 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3721 layer->setAlpha(0.0f);
3722 layer->setBeta(6.0f);
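  // kCLIP computes min(max(x, alpha), beta), so alpha = 0 and beta = 6
  // reproduces tf.nn.relu6.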
3723 params->converter->SetLayerName(layer, node_def, "activation");
3724 params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
3725 return Status::OK();
3726 }
3727
3728 Status ConvertBiasAddInt8WithoutCalibration(OpConverterParams* params) {
3729 const auto& inputs = params->inputs;
3730 const auto& node_def = params->node_def;
3731 TF_RETURN_IF_ERROR(
3732 CheckInputsWeights(*params, {{"value", false}, {"bias", true}}));
3733 TF_RETURN_IF_ERROR(
3734 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3735 if (params->validation_only) return Status::OK();
3736
3737 ITensorProxyPtr tensor = inputs.at(0).tensor();
3738 const nvinfer1::Dims original_dims = tensor->getDimensions();
3739 TFAttrs attrs(node_def);
3740 const string data_format = attrs.get<string>("data_format");
3741 const int channel_index =
3742 (data_format == "NHWC" ? original_dims.nbDims - 1 : 0);
3743
3744 nvinfer1::Permutation permutation;
3745 if (channel_index != 0) {
3746 // Permute the dimensions so that the channel dimension is the first
3747 // dimension.
3748 for (int i = 0; i < original_dims.nbDims; ++i) {
3749 permutation.order[i] = i;
3750 }
3751 permutation.order[0] = channel_index;
3752 permutation.order[channel_index] = 0;
3753 VLOG(1) << "ConvertBiasAdd permutation: "
3754 << DebugString(permutation, original_dims.nbDims);
3755 }
3756
3757 // TensorRT addScale requires input to be of rank 3, we need to apply
3758 // transpose as well as reshape.
3759 // TODO(laigd): this doesn't match what the TRT doc says, fix the doc?
3760 if (channel_index != 0 || original_dims.nbDims != 3) {
3761 nvinfer1::IShuffleLayer* shuffle_layer =
3762 params->converter->network()->addShuffle(*tensor->trt_tensor());
3763 TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name());
3764 params->converter->SetLayerName(shuffle_layer, node_def, "shuffle",
3765 /*op_instance=*/0);
3766
3767 // NOTE(laigd): for some reason we need to apply the reshape
3768 // unconditionally. The default shape has nbDims==-1 and it seems the
3769 // behavior is undefined in some cases.
3770 nvinfer1::Dims reshape_dims;
3771 reshape_dims.nbDims = 3;
3772 // 0 means copying from input; -1 means inferring from the rest.
3773 reshape_dims.d[0] = 0;
3774 reshape_dims.d[1] = original_dims.nbDims >= 2 ? 0 : 1;
3775 reshape_dims.d[2] = original_dims.nbDims >= 3 ? -1 : 1;
3776 shuffle_layer->setReshapeDimensions(reshape_dims);
3777
3778 if (channel_index != 0) {
3779 shuffle_layer->setFirstTranspose(permutation);
3780 }
3781 tensor = shuffle_layer->getOutput(0);
3782 }
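  // Illustrative example (implicit batch, NHWC): an input of shape [H, W, C]
  // is transposed so that C leads and then reshaped (rank stays 3), so the
  // addScale call below can apply the bias per channel; a rank-1 input [C]
  // is reshaped to [C, 1, 1].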
3783
3784 TRT_ShapedWeights weights = inputs.at(1).weights();
3785 nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL;
3786 if (weights.shape_.d[0] == 1) {
3787 mode = nvinfer1::ScaleMode::kUNIFORM;
3788 }
3789
3790 TRT_ShapedWeights empty_weights(weights.TrtDType());
3791 nvinfer1::IScaleLayer* layer = params->converter->network()->addScale(
3792 *tensor->trt_tensor(), mode, weights.GetTrtWeights(),
3793 empty_weights.GetTrtWeights(), empty_weights.GetTrtWeights());
3794 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3795 params->converter->SetLayerName(layer, node_def, "scale");
3796
3797 ITensorProxyPtr output_tensor = layer->getOutput(0);
3798
3799 // Restore transpose & reshape.
3800 if (channel_index != 0 || original_dims.nbDims != 3) {
3801 nvinfer1::IShuffleLayer* shuffle_layer =
3802 params->converter->network()->addShuffle(*output_tensor->trt_tensor());
3803 TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name());
3804 params->converter->SetLayerName(shuffle_layer, node_def, "shuffle",
3805 /*op_instance=*/1);
3806 // NOTE: for same reason as mentioned above we need to apply the reshape
3807 // unconditionally.
3808 nvinfer1::Dims reshape_dims = original_dims;
3809 if (channel_index != 0) {
3810 // NOTE: according to NVIDIA, dimension types are deprecated, so we don't
3811 // need to copy them back.
3812 reshape_dims.d[channel_index] = original_dims.d[0];
3813 reshape_dims.d[0] = original_dims.d[channel_index];
3814 }
3815 shuffle_layer->setReshapeDimensions(reshape_dims);
3816
3817 if (channel_index != 0) {
3818 shuffle_layer->setSecondTranspose(permutation);
3819 }
3820 output_tensor = shuffle_layer->getOutput(0);
3821 }
3822
3823 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
3824 return Status::OK();
3825 }
3826
3827 Status ConvertBiasAdd(OpConverterParams* params) {
3828 if (params->precision_mode == TrtPrecisionMode::INT8 &&
3829 !params->use_calibration) {
3830 // NOTE(laigd): based on some observation, it seems TensorRT cannot fuse
3831 // IConvolutionLayer and IElementwiseLayer and will require range
3832 // information for the output of Conv2D. Using IScaleLayer will fix the
3833 // problem.
3834 return ConvertBiasAddInt8WithoutCalibration(params);
3835 }
3836 const auto& inputs = params->inputs;
3837 const auto& node_def = params->node_def;
3838
3839 if (inputs.size() != 2) {
3840 return errors::InvalidArgument(
3841 "BiasAdd expects exactly 2 inputs, but received ", inputs.size());
3842 }
3843
3844 if (inputs[0].is_weights() && inputs[1].is_weights()) {
3845 return errors::InvalidArgument(
3846 "All inputs are weights, but Grappler is expected to fold them.");
3847 }
3848
3849 TF_RETURN_IF_ERROR(
3850 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3851
3852 TFAttrs attrs(node_def);
3853 const string& data_format = attrs.get<string>("data_format");
3854
3855 nvinfer1::Dims input_shape = inputs.at(0).GetTrtDims();
3856 nvinfer1::Dims bias_shape = inputs.at(1).GetTrtDims();
3857 // The bias input arg is a 1-D tensor with length C. If the input is NCHW,
3858 // then we need to unsqueeze the bias such that its shape is [1, C, 1, 1].
3859 if (data_format == "NCHW") {
3860 if (params->use_implicit_batch) {
3861 // The batch dim is not included in implicit batch mode, so the shape of
3862 // the bias tensor is [C, 1, 1].
3863 bias_shape.nbDims = input_shape.nbDims;
3864 std::fill(bias_shape.d + 1, bias_shape.d + bias_shape.nbDims, 1);
3865 } else {
3866 // In explicit batch mode we create a tensor with shape [1, C, 1, 1].
3867 std::vector<int> bias_shape_vec(bias_shape.d,
3868 bias_shape.d + bias_shape.nbDims);
3869 // Insert a leading 1 for the batch dim.
3870 bias_shape_vec.insert(bias_shape_vec.begin(), 1);
3871 // Pad with trailing 1s to match the input_shape rank.
3872 bias_shape_vec.insert(bias_shape_vec.end(),
3873 input_shape.nbDims - bias_shape_vec.size(), 1);
3874 TF_RETURN_IF_ERROR(ContainerToTrtDims(bias_shape_vec, &bias_shape));
3875 }
3876 } else {
3877 // Next, broadcast the bias across the input.
3878 TF_RETURN_IF_ERROR(GetTrtBroadcastShape(inputs.at(0), inputs.at(1),
3879 /*check_feasibility=*/true,
3880 params->use_implicit_batch,
3881 &input_shape, &bias_shape));
3882 }
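  // Illustrative example: for an explicit-batch NCHW input of shape
  // [N, C, H, W] and a bias of length C, bias_shape becomes [1, C, 1, 1]
  // (or [C, 1, 1] in implicit batch mode) so the elementwise sum below
  // broadcasts the bias across H and W.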
3883
3884 // Convert input to a TRT tensor
3885 ITensorProxyPtr input_tensor{nullptr};
3886 TF_RETURN_IF_ERROR(PrepareTensorForShape(params->converter, inputs.at(0),
3887 input_shape, params->validation_only,
3888 &input_tensor, node_def,
3889 /*op_instance=*/0));
3890
3891 // Finally, reshape bias. Since the bias is usually a constant, this will
3892 // normally happen at conversion-time.
3893 ITensorProxyPtr bias_tensor{nullptr};
3894 TF_RETURN_IF_ERROR(PrepareTensorForShape(params->converter, inputs.at(1),
3895 bias_shape, params->validation_only,
3896 &bias_tensor, node_def,
3897 /*op_instance=*/1));
3898 VLOG(2) << "Bias shape adjusted to " << DebugString(bias_shape);
3899
3900 if (params->validation_only) return Status::OK();
3901
3902 nvinfer1::IElementWiseLayer* layer =
3903 params->converter->network()->addElementWise(
3904 *input_tensor->trt_tensor(), *bias_tensor->trt_tensor(),
3905 nvinfer1::ElementWiseOperation::kSUM);
3906 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3907 params->converter->SetLayerName(layer, node_def, "sum");
3908 ITensorProxyPtr output_tensor = layer->getOutput(0);
3909
3910 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
3911 return Status::OK();
3912 }
3913
3914 void GetTensorDimsWithProtoShape(const Tensor& tensor, nvinfer1::Dims* dims) {
3915 if (tensor.dims() > 0) {
3916 *dims = GetTrtDimsForTensor(tensor);
3917 } else {
3918 dims->nbDims = 0; // Use scalar weights to implement scalar constants.
3919 // No dimension provided. Flatten it.
3920 dims->d[0] = tensor.NumElements();
3921 for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; ++i) {
3922 dims->d[i] = 0;
3923 }
3924 }
3925 }
3926
3927 template <typename Input>
3928 inline bool IsIntegerInInt32Bounds(const Input& inp) {
3929 static_assert(std::is_integral<Input>::value,
3930 "This function is only implemented for integral types.");
3931 // If Input is always within the range of int32, return true.
3932 if (sizeof(Input) < sizeof(int32) || std::is_same<Input, int32>::value) {
3933 return true;
3934 }
3935 // Otherwise, we need to check the value of the input. If the input is
3936 // unsigned, we only check the upper bound.
3937 if (!std::numeric_limits<Input>::is_signed) {
3938 return inp <= static_cast<Input>(std::numeric_limits<int32>::max());
3939 }
3940 // We can safely cast lowest() here since we now know that Input is signed and
3941 // sizeof(Input) >= sizeof(int32)
3942 return (inp >= static_cast<Input>(std::numeric_limits<int32>::lowest()) &&
3943 inp <= static_cast<Input>(std::numeric_limits<int32>::max()));
3944 }
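// Worked example (illustrative): passing an int16 value always returns true
// because the type itself fits in int32; for an int64 argument the value is
// checked, e.g. int64{42} is accepted while (int64{1} << 40) is rejected. For
// unsigned types such as uint64 only the upper bound is checked, since the
// value cannot be negative.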
3945
3946 template <DataType dtype>
3947 Status CopyToTrtInt32Array(const Tensor& tensor, int32* dst) {
3948 typedef typename EnumToDataType<dtype>::Type CType;
3949 const CType* src = tensor.flat<CType>().data();
3950 for (int i = 0; i < tensor.NumElements(); ++i) {
3951 // This becomes a no-op if CType is within bounds of int32
3952 if (!IsIntegerInInt32Bounds(src[i])) {
3953 return errors::InvalidArgument("Value at index ", i,
3954 " is outside the range of int32");
3955 }
3956 dst[i] = static_cast<int32>(src[i]);
3957 }
3958 return Status::OK();
3959 }
3960
3961 Status TfTensorToTrtWeights(const Tensor& tensor, TrtWeightStore* weight_store,
3962 TRT_ShapedWeights* weights) {
3963 const DataType dtype = tensor.dtype();
3964
3965 // We always convert the integer constants to INT32.
3966 //
3967 // TODO(aaroey): FP16 will remain in half format and is not converted to
3968 // FP32, but the converter currently uses all float weights as FP32. Fix
3969 // this.
3970 DataType converted_dtype = DataTypeIsInteger(dtype) ? DT_INT32 : dtype;
3971
3972 // Verify that the dtype is supported by TensorRT. Otherwise, return an error.
3973 nvinfer1::DataType trt_dtype;
3974 TF_RETURN_IF_ERROR(TfTypeToTrtType(converted_dtype, &trt_dtype));
3975
3976 if (tensor.NumElements() == 0) {
3977 // Return empty weights.
3978 *weights = TRT_ShapedWeights(trt_dtype);
3979 return Status::OK();
3980 }
3981
3982 nvinfer1::Dims weight_dims;
3983 GetTensorDimsWithProtoShape(tensor, &weight_dims);
3984 *weights = weight_store->GetTempWeights(trt_dtype, weight_dims);
3985
3986 // Copy the tensor directly if the tensor does not require cast to the
3987 // supported type.
3988 if (converted_dtype == dtype) {
3989 char* dst = static_cast<char*>(weights->GetValues());
3990 memcpy(dst, tensor.tensor_data().data(), tensor.TotalBytes());
3991 return Status::OK();
3992 }
3993
3994 Status status = Status::OK();
3995 // Copy tensor elements after casting them to the converted DataType.
3996 int32* dst = static_cast<int32*>(weights->GetValues());
3997 switch (dtype) {
3998 case DT_INT8:
3999 status = CopyToTrtInt32Array<DT_INT8>(tensor, dst);
4000 break;
4001 case DT_UINT8:
4002 status = CopyToTrtInt32Array<DT_UINT8>(tensor, dst);
4003 break;
4004 case DT_INT16:
4005 status = CopyToTrtInt32Array<DT_INT16>(tensor, dst);
4006 break;
4007 case DT_UINT16:
4008 status = CopyToTrtInt32Array<DT_UINT16>(tensor, dst);
4009 break;
4010 case DT_UINT32:
4011 status = CopyToTrtInt32Array<DT_UINT32>(tensor, dst);
4012 break;
4013 case DT_INT64:
4014 status = CopyToTrtInt32Array<DT_INT64>(tensor, dst);
4015 break;
4016 case DT_UINT64:
4017 status = CopyToTrtInt32Array<DT_UINT64>(tensor, dst);
4018 break;
4019 default:
4020 return errors::Internal("Unexpected DataType: ", DataTypeString(dtype));
4021 }
4022 return status;
4023 }
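// Worked example (illustrative, assumed values): an int64 Const tensor holding
// {1, 2, 3} is stored as INT32 TRT weights {1, 2, 3}, while a value such as
// (int64{1} << 40) makes CopyToTrtInt32Array fail with InvalidArgument. Float
// and half tensors take the memcpy branch above unchanged, because for them
// converted_dtype == dtype.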
4024
4025 // Converts a Const NodeDef to TRT_ShapedWeights. This is a special converter: it
4026 // always ignores the params->validation_only parameter but adds the converted
4027 // weights to params->outputs. We do this because TrtNodeValidator needs the
4028 // weights as inputs to other nodes, and uses them to determine whether those
4029 // nodes are supported by TRT.
4030 Status ConvertConst(OpConverterParams* params) {
4031 const auto& inputs = params->inputs;
4032 const auto& node_def = params->node_def;
4033 if (!inputs.empty()) {
4034 return errors::InvalidArgument(
4035 "Constant node is expected to have empty input list: ",
4036 node_def.name());
4037 }
4038
4039 // Create shaped weights as output
4040 const auto& tensor_proto = node_def.attr().at("value").tensor();
4041 Tensor tensor;
4042 if (!tensor.FromProto(tensor_proto)) {
4043 return errors::Internal("Cannot parse weight tensor proto: ",
4044 node_def.name());
4045 }
4046
4047 TFAttrs attrs(node_def);
4048 const DataType dtype = attrs.get<DataType>("dtype");
4049 if (dtype != tensor.dtype()) {
4050 return errors::InvalidArgument("DataType mismatch between attr (",
4051 DataTypeString(dtype), ") and tensor (",
4052 DataTypeString(tensor.dtype()), ")");
4053 }
4054
4055 TRT_ShapedWeights weights;
4056 TF_RETURN_IF_ERROR(
4057 TfTensorToTrtWeights(tensor, params->weight_store, &weights));
4058
4059 if (params->outputs != nullptr) {
4060 params->outputs->push_back(TRT_TensorOrWeights(weights));
4061 }
4062 return Status::OK();
4063 }
4064
4065 Status ConvertIdentity(OpConverterParams* params) {
4066   // TODO(tmorris): TRT's Identity layer does not get optimized away as of TRT
4067   // 5.0; once we know that it does, it would be nice to use that layer
4068   // instead.
4069 if (params->validation_only) return Status::OK();
4070 params->outputs->push_back(params->inputs.at(0));
4071 return Status::OK();
4072 }
4073
4074 const std::unordered_map<string, nvinfer1::ElementWiseOperation>*
4075 BinaryOperationMap() {
4076 static auto* const m =
4077 new std::unordered_map<string, nvinfer1::ElementWiseOperation> {
4078 {"Add", nvinfer1::ElementWiseOperation::kSUM},
4079 {"AddV2", nvinfer1::ElementWiseOperation::kSUM},
4080 {"Mul", nvinfer1::ElementWiseOperation::kPROD},
4081 {"Sub", nvinfer1::ElementWiseOperation::kSUB},
4082 {"Div", nvinfer1::ElementWiseOperation::kDIV},
4083 {"FloorDiv", nvinfer1::ElementWiseOperation::kFLOOR_DIV},
4084 {"RealDiv", nvinfer1::ElementWiseOperation::kDIV},
4085 {"Minimum", nvinfer1::ElementWiseOperation::kMIN},
4086 {"Maximum", nvinfer1::ElementWiseOperation::kMAX},
4087 {"Pow", nvinfer1::ElementWiseOperation::kPOW},
4088 };
4089 return m;
4090 }
4091
4092 Status ConvertBinary(OpConverterParams* params) {
4093 const auto& inputs = params->inputs;
4094 const auto& node_def = params->node_def;
4095 if (inputs.size() != 2) {
4096 return errors::InvalidArgument(node_def.op(), " got ", inputs.size(),
4097 " inputs but expected 2, at ",
4098 node_def.name());
4099 }
4100 std::set<DataType> allowed_types{DataType::DT_FLOAT, DataType::DT_HALF,
4101 DataType::DT_INT32};
4102 TF_RETURN_IF_ERROR(AllowDataTypes(*params, allowed_types));
4103
4104 // Constant folding should have been done by TensorFlow
4105 if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) {
4106 return errors::Unimplemented(
4107 "Constant folding is falled back to TensorFlow, binary op received "
4108 "both input as constant at: ",
4109 node_def.name());
4110 }
4111 const TRT_TensorOrWeights& operand_l = inputs.at(0);
4112 const TRT_TensorOrWeights& operand_r = inputs.at(1);
4113
4114 auto op_pair = BinaryOperationMap()->find(node_def.op());
4115 if (op_pair == BinaryOperationMap()->end()) {
4116 return errors::Unimplemented("Binary op ", node_def.op(),
4117 " not supported at: ", node_def.name());
4118 }
4119
4120 nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r;
4121 TF_RETURN_IF_ERROR(GetTrtBroadcastShape(
4122 operand_l, operand_r, /*check_feasibility=*/true,
4123 params->use_implicit_batch, &broadcasted_dims_l, &broadcasted_dims_r));
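  // Worked example (illustrative): in implicit batch mode, adding a tensor
  // with dims [3, 4] (excluding the batch dim) to a weight of shape [4] yields
  // broadcasted_dims_l = [3, 4] and broadcasted_dims_r = [1, 4]; the
  // PrepareTensorForShape calls below then materialize both operands with
  // equal rank so the elementwise layer can broadcast them.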
4124 ITensorProxyPtr tensor_l = nullptr;
4125 ITensorProxyPtr tensor_r = nullptr;
4126 // This will also convert constants to tensors.
4127 TF_RETURN_IF_ERROR(PrepareTensorForShape(
4128 params->converter, operand_l, broadcasted_dims_l, params->validation_only,
4129 &tensor_l, node_def, /*op_instance=*/0));
4130 TF_RETURN_IF_ERROR(PrepareTensorForShape(
4131 params->converter, operand_r, broadcasted_dims_r, params->validation_only,
4132 &tensor_r, node_def, /*op_instance=*/1));
4133 if (params->validation_only) return Status::OK();
4134
4135 // Add ElementWise layer.
4136 nvinfer1::ILayer* layer = params->converter->network()->addElementWise(
4137 *tensor_l->trt_tensor(), *tensor_r->trt_tensor(), op_pair->second);
4138 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
4139 params->converter->SetLayerName(layer, node_def);
4140 ITensorProxyPtr trt_tensor = layer->getOutput(0);
4141
4142 params->outputs->push_back(TRT_TensorOrWeights(trt_tensor));
4143 return Status::OK();
4144 }
4145
4146 Status ConvertRsqrt(OpConverterParams* params) {
4147 const auto& inputs = params->inputs;
4148 const auto& node_def = params->node_def;
4149 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}}));
4150 TF_RETURN_IF_ERROR(
4151 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
4152 if (params->validation_only) return Status::OK();
4153
4154 // Start conversion.
4155 ITensorProxyPtr tensor = inputs.at(0).tensor();
4156 // Sqrt
4157 nvinfer1::IUnaryLayer* sqrt_layer = params->converter->network()->addUnary(
4158 *tensor->trt_tensor(), nvinfer1::UnaryOperation::kSQRT);
4159 TFTRT_RETURN_ERROR_IF_NULLPTR(sqrt_layer, node_def.name());
4160 params->converter->SetLayerName(sqrt_layer, node_def, "sqrt");
4161 // Recip
4162 nvinfer1::IUnaryLayer* recip_layer = params->converter->network()->addUnary(
4163 *sqrt_layer->getOutput(0), nvinfer1::UnaryOperation::kRECIP);
4164 TFTRT_RETURN_ERROR_IF_NULLPTR(recip_layer, node_def.name());
4165 params->converter->SetLayerName(recip_layer, node_def, "recip");
4166 params->outputs->push_back(TRT_TensorOrWeights(recip_layer->getOutput(0)));
4167 return Status::OK();
4168 }
4169
4170 const std::unordered_map<string, nvinfer1::UnaryOperation>*
4171 UnaryOperationMap() {
4172 static auto* const m =
4173 new std::unordered_map<string, nvinfer1::UnaryOperation>({
4174 {"Neg", nvinfer1::UnaryOperation::kNEG},
4175 {"Exp", nvinfer1::UnaryOperation::kEXP},
4176 {"Log", nvinfer1::UnaryOperation::kLOG},
4177 {"Sqrt", nvinfer1::UnaryOperation::kSQRT},
4178 {"Abs", nvinfer1::UnaryOperation::kABS},
4179 {"Reciprocal", nvinfer1::UnaryOperation::kRECIP},
4180 {"Sin", nvinfer1::UnaryOperation::kSIN},
4181 {"Cos", nvinfer1::UnaryOperation::kCOS},
4182 {"Tan", nvinfer1::UnaryOperation::kTAN},
4183 {"Sinh", nvinfer1::UnaryOperation::kSINH},
4184 {"Cosh", nvinfer1::UnaryOperation::kCOSH},
4185 {"Asin", nvinfer1::UnaryOperation::kASIN},
4186 {"Acos", nvinfer1::UnaryOperation::kACOS},
4187 {"Atan", nvinfer1::UnaryOperation::kATAN},
4188 {"Asinh", nvinfer1::UnaryOperation::kASINH},
4189 {"Acosh", nvinfer1::UnaryOperation::kACOSH},
4190 {"Atanh", nvinfer1::UnaryOperation::kATANH},
4191 {"Ceil", nvinfer1::UnaryOperation::kCEIL},
4192 {"Floor", nvinfer1::UnaryOperation::kFLOOR},
4193 {"Erf", nvinfer1::UnaryOperation::kERF},
4194 });
4195 return m;
4196 }
4197
4198 Status ConvertUnary(OpConverterParams* params) {
4199 const auto& inputs = params->inputs;
4200 const auto& node_def = params->node_def;
4201 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}}));
4202 TF_RETURN_IF_ERROR(
4203 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
4204 auto op_pair = UnaryOperationMap()->find(node_def.op());
4205 if (op_pair == UnaryOperationMap()->end()) {
4206 return errors::Unimplemented("Unary op: ", node_def.op(),
4207 " not supported at: ", node_def.name());
4208 }
4209 if (params->validation_only) return Status::OK();
4210
4211 // Start conversion.
4212 ITensorProxyPtr tensor = inputs.at(0).tensor();
4213 nvinfer1::IUnaryLayer* layer = params->converter->network()->addUnary(
4214 *tensor->trt_tensor(), op_pair->second);
4215 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
4216 params->converter->SetLayerName(layer, node_def);
4217 ITensorProxyPtr output_tensor = layer->getOutput(0);
4218
4219 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
4220 return Status::OK();
4221 }
4222
4223 Status ConvertSquare(OpConverterParams* params) {
4224 const auto& inputs = params->inputs;
4225 const auto& node_def = params->node_def;
4226 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}}));
4227 TF_RETURN_IF_ERROR(AllowDataTypes(
4228 *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
4229 if (params->validation_only) return Status::OK();
4230
4231 // Constant 2 with same rank as input
4232 ITensorProxyPtr const2_tensor = nullptr;
4233 TF_RETURN_IF_ERROR(CreateBroadcastableScalarConstant(
4234 params, 2.0f, inputs.at(0).GetTrtDims(), &const2_tensor));
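  // Note (illustrative): Square(x) is expressed as pow(x, 2). The scalar
  // constant 2.0f created above is shaped so that TensorRT broadcasts it
  // across every element of the input, e.g. a [3, 4] input simply has each of
  // its 12 values raised to the power of two by the kPOW layer below.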
4235
4236 // ElementWise Pow Operation
4237 nvinfer1::IElementWiseLayer* layer =
4238 params->converter->network()->addElementWise(
4239 *inputs.at(0).tensor()->trt_tensor(), *const2_tensor->trt_tensor(),
4240 nvinfer1::ElementWiseOperation::kPOW);
4241 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
4242 params->converter->SetLayerName(layer, node_def);
4243 ITensorProxyPtr output_tensor = layer->getOutput(0);
4244
4245 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
4246 return Status::OK();
4247 }
4248
4249 Status ConvertReduce(OpConverterParams* params) {
4250 const auto& inputs = params->inputs;
4251 const auto& node_def = params->node_def;
4252 TF_RETURN_IF_ERROR(
4253 CheckInputsWeights(*params, {{"input", false}, {"axis", true}}));
4254 TF_RETURN_IF_ERROR(
4255 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
4256
4257 ITensorProxyPtr tensor = inputs.at(0).tensor();
4258 auto tf_axes_list = inputs.at(1).weights().GetSpan<int>();
4259
4260 TFAttrs attrs(node_def);
4261 // Only expect to handle INT32 as attributes for now
4262 if (attrs.get<DataType>("Tidx") != DataType::DT_INT32) {
4263 return errors::Unimplemented("Tidx supports only DT_INT32");
4264 }
4265
4266 int axes = 0;
4267 if (tf_axes_list.size() == 0) {
4268 return errors::InvalidArgument(
4269 "TRT cannot support reduce on all (batch) dimensions, at",
4270 node_def.name());
4271 }
4272 for (int i = 0; i < tf_axes_list.size(); i++) {
4273 int trt_axis;
4274 TF_RETURN_IF_ERROR(
4275 ConvertAxis(tf_axes_list[i], tensor->getDimensions().nbDims,
4276 node_def.name(), params->use_implicit_batch, &trt_axis));
4277 axes |= (1 << trt_axis);
4278 }
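  // Worked example (illustrative): reducing TF axes {1, 2} of an NCHW tensor
  // in implicit batch mode maps to TRT axes {0, 1} (the batch dim is
  // excluded), so the bitmask becomes (1 << 0) | (1 << 1) = 3. In explicit
  // batch mode the same TF axes map to TRT axes {1, 2} and the bitmask is 6.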
4279
4280 nvinfer1::ReduceOperation reduce_operation;
4281 if (node_def.op() == "Sum") {
4282 reduce_operation = nvinfer1::ReduceOperation::kSUM;
4283 } else if (node_def.op() == "Prod") {
4284 reduce_operation = nvinfer1::ReduceOperation::kPROD;
4285 } else if (node_def.op() == "Max") {
4286 reduce_operation = nvinfer1::ReduceOperation::kMAX;
4287 } else if (node_def.op() == "Min") {
4288 reduce_operation = nvinfer1::ReduceOperation::kMIN;
4289 } else if (node_def.op() == "Mean") {
4290 reduce_operation = nvinfer1::ReduceOperation::kAVG;
4291 } else {
4292 return errors::Unimplemented("Op not supported ", node_def.op(), ", at ",
4293 node_def.name());
4294 }
4295 if (params->validation_only) return Status::OK();
4296
4297 const auto keep_dims = attrs.get<bool>("keep_dims");
4298 nvinfer1::ILayer* layer = params->converter->network()->addReduce(
4299 *tensor->trt_tensor(), reduce_operation, axes, keep_dims);
4300 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
4301 params->converter->SetLayerName(layer, node_def);
4302
4303 params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
4304 return Status::OK();
4305 }
4306
4307 // TensorRT does not support the Pack op natively. Therefore, Pack op is
4308 // converted by first expanding input tensors by adding a new dimension of size
4309 // one at the specified axis and then concatenating the tensors at the same
4310 // axis.
4311 Status ConvertPack(OpConverterParams* params) {
4312 const auto& inputs = params->inputs;
4313 const auto& node_def = params->node_def;
4314
4315 TFAttrs attrs(node_def);
4316 const int num_inputs = attrs.get<int64>("N");
4317 if (num_inputs != inputs.size()) {
4318 return errors::InvalidArgument(
4319 "Number of inputs for Pack is inconsistent with N attribute, at ",
4320 node_def.name());
4321 }
4322
4323 // In implicit batch mode we do not allow weight input. An input tensor with
4324 // dims NCHW is represented with dims CHW during conversion time, and N is
4325 // defined only during runtime. A weight is represented with dims NCHW. We
4326 // cannot be sure that the runtime N will agree with the conversion time N,
4327 // therefore we do not convert the pack op if it has both tensor and weight
4328 // inputs. This restriction does not apply in explicit batch mode, in that
4329 // case the input tensors are also represented with full dims that include the
4330 // batch size.
4331 TrtInputArg expected_arg =
4332 params->use_implicit_batch ? TrtInputArg::kTensor : TrtInputArg::kBoth;
4333
4334 std::vector<std::pair<string, TrtInputArg>> inputs_is_weight;
4335 inputs_is_weight.reserve(num_inputs);
4336 for (int i = 0; i < num_inputs; ++i) {
4337 inputs_is_weight.push_back({StrCat("values_", i), expected_arg});
4338 }
4339 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, inputs_is_weight));
4340
4341 std::set<DataType> allowed_types{DataType::DT_FLOAT, DataType::DT_HALF,
4342 DataType::DT_INT32};
4343 TF_RETURN_IF_ERROR(AllowDataTypes(*params, allowed_types));
4344 if (num_inputs > 1) {
4345 // Verify that inputs are compatible for concatenation after the expansion.
4346 TF_RETURN_IF_ERROR(
4347 VerifyShapesMatch(inputs, /*masked_dim=*/-1, node_def.name()));
4348 }
4349
4350 // Find the dimension of the inputs. In general inputs can have dynamic shape,
4351 // in that case we have to use DynamicExpandDims to calculate the expanded
4352 // dimensions. To avoid that, we try to find a weight input which is
4353 // guaranteed to have known static shape.
4354 int idx = 0;
4355 for (int i = 1; i < inputs.size(); i++) {
4356 if (HasStaticShape(inputs.at(i).GetTrtDims())) {
4357 idx = i;
4358 }
4359 }
4360 const nvinfer1::Dims dims = inputs.at(idx).GetTrtDims();
4361 // Convert axis from the TensorFlow format to TensorRT format.
4362 const int64 tf_axis = attrs.get<int64>("axis");
4363 int trt_axis;
4364 TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims + 1, node_def.name(),
4365 params->use_implicit_batch, &trt_axis));
4366
4367 // Compute expanded dimensions and then reshape input tensors.
4368 std::vector<int> tensor_dims(dims.d, dims.d + dims.nbDims);
4369 tensor_dims.insert(tensor_dims.begin() + trt_axis, 1);
4370 nvinfer1::Dims expanded_dims;
4371 TF_RETURN_IF_ERROR(ContainerToTrtDims(tensor_dims, &expanded_dims));
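  // Worked example (illustrative): packing three tensors of dims [3, 4] along
  // axis 1 in explicit batch mode gives expanded_dims = [3, 1, 4]; each input
  // is reshaped to that shape and the concatenation along trt_axis = 1 below
  // produces an output of dims [3, 3, 4].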
4372 std::vector<ITensorProxyPtr> expanded_tensors;
4373 int input_index = 0;
4374 for (const TRT_TensorOrWeights& input : inputs) {
4375 ITensorProxyPtr expanded_tensor = nullptr;
4376 if (input.is_tensor() && !params->use_implicit_batch &&
4377 !HasStaticShape(dims)) {
4378 if (!params->validation_only) {
4379 TF_RETURN_IF_ERROR(params->converter->DynamicExpandDims(
4380 input.tensor(), dims, trt_axis, params, &expanded_tensor,
4381 input_index));
4382 }
4383 } else {
4384 TF_RETURN_IF_ERROR(PrepareTensorForShape(
4385 params->converter, input, expanded_dims, params->validation_only,
4386 &expanded_tensor, node_def, input_index));
4387 }
4388 if (!params->validation_only) {
4389 expanded_tensors.push_back(expanded_tensor);
4390 }
4391 input_index++;
4392 }
4393 if (params->validation_only) return Status::OK();
4394
4395 // If there is only one tensor in the input, return the expanded tensor.
4396 if (num_inputs == 1) {
4397 params->outputs->push_back(TRT_TensorOrWeights(expanded_tensors[0]));
4398 return Status::OK();
4399 }
4400
4401 // Otherwise, concatenate expanded tensors.
4402 std::vector<nvinfer1::ITensor*> trt_expanded_tensors;
4403 for (const auto& t : expanded_tensors) {
4404 trt_expanded_tensors.push_back(t->trt_tensor());
4405 }
4406 nvinfer1::IConcatenationLayer* layer =
4407 params->converter->network()->addConcatenation(
4408 static_cast<nvinfer1::ITensor* const*>(trt_expanded_tensors.data()),
4409 expanded_tensors.size());
4410 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
4411 params->converter->SetLayerName(layer, node_def, "concat");
4412 // Note that trt_axis stays the same even after expanding tensors at the axis.
4413 layer->setAxis(trt_axis);
4414 params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
4415 return Status::OK();
4416 }
4417
4418 Status ConvertPad(OpConverterParams* params) {
4419 const auto& inputs = params->inputs;
4420 const auto& node_def = params->node_def;
4421 TF_RETURN_IF_ERROR(
4422 CheckInputsWeights(*params, {{"tensor", false}, {"paddings", true}}));
4423 TF_RETURN_IF_ERROR(AllowDataTypes(
4424 *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT8}));
4425
4426 // Implement tensor binaryOp weight [channel wise] for now;
4427 ITensorProxyPtr tensor = inputs.at(0).tensor();
4428 const auto dims = tensor->getDimensions();
4429 // Restore implicit batch dimension
4430 const int nb_dims =
4431 params->use_implicit_batch ? dims.nbDims + 1 : dims.nbDims;
4432
4433 // TODO(tfeher): Support nb_dims < 4 by inserting extra dimensions to the
4434 // original input.
4435 if (nb_dims < 4) {
4436 return errors::InvalidArgument("Convertpad requires at least 4D input, at ",
4437 node_def.name());
4438 }
4439 TRT_ShapedWeights pads = inputs.at(1).weights();
4440
4441 TFAttrs attrs(node_def);
4442   // The padding type is read as a TF DataType so that EnumToDataType can be
4443   // leveraged for the cast.
4444 auto padding_type = attrs.get<DataType>("Tpaddings");
4445 // TODO(jie): handle data type conversion for TRT?
4446
4447 if (pads.shape_.d[0] != nb_dims || pads.shape_.d[1] != 2) {
4448 return errors::InvalidArgument("Paddings at ", node_def.name(),
4449 " must be a weight with shape [n, 2], "
4450 "where n is the rank of input tensor");
4451 }
4452
4453 // Only expect to handle INT32 as attributes for now
4454 if (padding_type != DataType::DT_INT32) {
4455 return errors::Unimplemented("Tpaddings supports only DT_INT32");
4456 }
4457 auto pad_data = static_cast<int*>(pads.GetValues());
4458
4459 std::vector<int32_t> tf_pad_index;
4460 for (int i = 0; i < nb_dims; i++) {
4461 if (pad_data[2 * i] != 0 || pad_data[2 * i + 1] != 0) {
4462 tf_pad_index.push_back(i);
4463 }
4464 }
4465
4466 // No padding at all, we should exit
4467 if (tf_pad_index.empty()) {
4468 params->outputs->push_back(inputs.at(0));
4469 return Status::OK();
4470 }
4471
4472 // TRT pad layer can only support padding on up to 2 dimensions (TRT-2579).
4473 // TODO(tfeher): Use multiple TRT pad layers to support padding on more than 2
4474 // dimensions.
4475 if (tf_pad_index.size() > 2) {
4476 return errors::InvalidArgument(
4477 "Padding layer does not support padding on > 2");
4478 }
4479
4480 // Padding on batch dimension is not supported
4481 if (params->use_implicit_batch && tf_pad_index[0] == 0) {
4482 return errors::InvalidArgument(
4483 "Padding layer does not support padding on batch dimension");
4484 }
4485
4486 if (params->validation_only) return Status::OK();
4487
4488 // TRT can only do the padding at the last two dimensions. We transpose the
4489 // input tensor if needed.
4490 bool transposed_pad = false;
4491 std::vector<int> transpose_idx(nb_dims);
4492 std::iota(transpose_idx.begin(), transpose_idx.end(), 0);
4493
4494 // trt_pad_index denotes the actual idx where the padding is performed by TRT.
4495 std::vector<int> trt_pad_index{nb_dims - 2, nb_dims - 1};
4496
4497 // How many zeros are padded at the last two dimensions.
4498 nvinfer1::DimsHW pre_padding(0, 0);
4499 nvinfer1::DimsHW post_padding(0, 0);
4500
4501 // Dimension to set in the pre_padding and post_padding array.
4502 std::vector<int> trt_pre_post_padding_index{0, 1};
4503
4504 // Two special cases where we can avoid permutations.
4505 if (tf_pad_index.size() == 1 && tf_pad_index[0] == nb_dims - 1) {
4506 // Only one dimension needs to be padded. We store its index at
4507 // trt_pad_index[0]. We ignore trt_pad_index[1].
4508 trt_pad_index[0] = nb_dims - 1;
4509 trt_pre_post_padding_index[0] = 1;
4510 }
4511 if (tf_pad_index.size() == 2 && tf_pad_index[1] == nb_dims - 2) {
4512 // tf_pad_index only has two values that are in ascending order. If
4513 // tf_pad_index[1] is nb_dims-2, then swapping the two values in
4514 // trt_pad_index here makes it possible to only swap one pair of dimensions
4515 // (swap tf_pad_index[0] with nb_dims-1) in the input tensor. Otherwise, we
4516 // would have to swap two pairs of dimensions in the input tensor:
4517 // (tf_pad_index[0] with nb_dims-2) and (tf_pad_index[1], with nb_dims-1).
4518 // Here is an example for a 4D input tensor:
4519 // tf_pad_index = [1, 2]
4520 // trt_pad_index = [3, 2]
4521 // transpose_idx = [0, 3, 2, 1]
4522 std::swap(trt_pad_index[0], trt_pad_index[1]);
4523 std::swap(trt_pre_post_padding_index[0], trt_pre_post_padding_index[1]);
4524 }
4525
4526 for (int i = 0; i < tf_pad_index.size(); i++) {
4527 const int tf_index = tf_pad_index[i];
4528 const int trt_index = trt_pad_index[i];
4529 const int k = trt_pre_post_padding_index[i];
4530 pre_padding.d[k] = pad_data[tf_index * 2];
4531 post_padding.d[k] = pad_data[tf_index * 2 + 1];
4532 if (tf_index != trt_index) {
4533 transposed_pad = true;
4534 std::swap(transpose_idx[tf_index], transpose_idx[trt_index]);
4535 }
4536 }
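  // Worked example (illustrative): for a 4D NCHW input with paddings
  // [[0, 0], [0, 0], [1, 1], [2, 2]], tf_pad_index = {2, 3} already matches
  // trt_pad_index = {2, 3}, so no transpose is needed and the layer receives
  // pre_padding = (1, 2), post_padding = (1, 2). Padding C and H instead
  // (tf_pad_index = {1, 2}) would trigger the transpose path set up above.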
4537
4538 if (transposed_pad) {
4539 TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
4540 tensor, transpose_idx, &tensor, node_def, "to_pad"));
4541 }
4542
4543 nvinfer1::IPaddingLayer* layer = params->converter->network()->addPadding(
4544 *tensor->trt_tensor(), pre_padding, post_padding);
4545 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
4546 params->converter->SetLayerName(layer, node_def);
4547 ITensorProxyPtr output_tensor = layer->getOutput(0);
4548
4549 if (transposed_pad) {
4550 TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
4551 output_tensor, transpose_idx, &output_tensor, node_def, "from_pad"));
4552 }
4553
4554 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
4555 return Status::OK();
4556 }
4557
4558 Status ConvertSplitHelper(OpConverterParams* params,
4559 const TRT_TensorOrWeights& input, int tf_axis,
4560 int num_splits, bool squeeze_after) {
4561 const auto& node_def = params->node_def;
4562 const nvinfer1::Dims dims = input.GetTrtDims();
4563 // Convert axis.
4564 int trt_axis;
4565 TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(),
4566 params->use_implicit_batch, &trt_axis));
4567
4568 if (dims.d[trt_axis] < 0) {
4569 return errors::InvalidArgument(
4570 "Dimension ", tf_axis, " must have statically defined dimensions, at ",
4571 node_def.name());
4572 }
4573
4574 // Dimension must equal num_splits for Unstack (when squeeze_after is true)
4575 if (squeeze_after && dims.d[trt_axis] != num_splits) {
4576 return errors::InvalidArgument(
4577 "Dimension ", tf_axis, " has size ", dims.d[trt_axis],
4578 " which is not equal to num of ", num_splits, ", at ", node_def.name());
4579 }
4580 // Dimension must be evenly divisible by num_splits.
4581 if (dims.d[trt_axis] % num_splits != 0) {
4582 return errors::InvalidArgument(
4583 "Dimension ", tf_axis, " of size ", dims.d[trt_axis],
4584 " is not evenly divisible by ", num_splits, ", at ", node_def.name());
4585 }
4586
4587 // Create parameters for StridedSliceHelper.
4588 // Slice will begin on zero for all dims, except the one being split which
4589 // will change.
4590 std::vector<int> begin(dims.nbDims, 0);
4591 // Determine size of split. Slice will get the full length of all dims, except
4592 // the one being split. Undefined dims (-1) will translate to a size of -1
4593 // which will tell StridedSlice to take full length of that dim.
4594 std::vector<int> size(dims.d, dims.d + dims.nbDims);
4595 const int split_size_on_axis = dims.d[trt_axis] / num_splits;
4596 size[trt_axis] = split_size_on_axis;
4597 // Stride will always be 1
4598 std::vector<int> stride(dims.nbDims, 1);
4599 // Add dummy batch dimension
4600 if (params->use_implicit_batch) {
4601 begin.insert(begin.begin(), 0);
4602 size.insert(size.begin(), 1);
4603 stride.insert(stride.begin(), 1);
4604 }
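  // Worked example (illustrative): splitting a tensor of dims [6, 8] into
  // num_splits = 3 along trt_axis = 0 gives split_size_on_axis = 2,
  // size = [2, 8] and stride = [1, 1], and the loop below slices with
  // begin = [0, 0], [2, 0] and [4, 0]. In implicit batch mode a leading dummy
  // batch entry is prepended to begin/size/stride, as done just above.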
4605 // Create final shape for Unpack/Unstack, where split axis is squeezed.
4606 nvinfer1::Dims final_shape_for_unpack;
4607 nvinfer1::Dims* final_shape_for_unpack_ptr = nullptr;
4608
4609 // We can't use final_shape_for_unpack_ptr when input dimensions are not
4610 // fully defined.
4611 const bool is_dynamic_shape = !HasStaticShape(dims);
4612 if (squeeze_after && !is_dynamic_shape) {
4613 std::vector<int> size_after_squeeze(size);
4614 const int tf_axis = trt_axis + (params->use_implicit_batch ? 1 : 0);
4615 size_after_squeeze.erase(size_after_squeeze.begin() + tf_axis);
4616 TF_RETURN_IF_ERROR(ContainerToTrtDims(size_after_squeeze,
4617 &final_shape_for_unpack,
4618 /*ignore_first_dim=*/
4619 params->use_implicit_batch));
4620 final_shape_for_unpack_ptr = &final_shape_for_unpack;
4621 }
4622
4623 // Slice the input. ConvertStridedSliceHelper will push the outputs onto
4624 // params->outputs.
4625 for (int i = 0; i < num_splits; ++i) {
4626 const int tf_axis = trt_axis + (params->use_implicit_batch ? 1 : 0);
4627 begin[tf_axis] = i * split_size_on_axis;
4628 TF_RETURN_IF_ERROR(ConvertStridedSliceHelper(
4629 params, input, begin, size, stride, final_shape_for_unpack_ptr,
4630 /*op_instance=*/i));
4631 }
4632 if (params->validation_only) return Status::OK();
4633
4634 // Squeeze for dynamic shapes
4635 if (squeeze_after && is_dynamic_shape) {
4636 for (int i = 0; i < params->outputs->size(); i++) {
4637 ITensorProxyPtr output_tensor = nullptr;
4638 std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
4639 input_dims[trt_axis] = 0;
4640 TF_RETURN_IF_ERROR(params->converter->SqueezeTensor(
4641 params->outputs->at(i).tensor(), &input_dims, params,
4642 &output_tensor));
4643 (*params->outputs)[i] = TRT_TensorOrWeights(output_tensor);
4644 }
4645 }
4646 return Status::OK();
4647 }
4648
4649 Status ConvertSplit(OpConverterParams* params) {
4650 const auto& inputs = params->inputs;
4651 const auto& node_def = params->node_def;
4652 TF_RETURN_IF_ERROR(
4653 CheckInputsWeights(*params, {{"axis", true}, {"value", false}}));
4654 TF_RETURN_IF_ERROR(AllowDataTypes(*params, {
4655 DataType::DT_FLOAT,
4656 DataType::DT_HALF,
4657 DataType::DT_INT32,
4658 }));
4659 int tf_axis = inputs.at(0).weights().GetSpan<int>()[0];
4660 TFAttrs attrs(node_def);
4661 const int num_split = attrs.get<int64>("num_split");
4662
4663 return ConvertSplitHelper(params, inputs.at(1), tf_axis, num_split, false);
4664 }
4665
4666 Status ConvertUnpack(OpConverterParams* params) {
4667 const auto& inputs = params->inputs;
4668 const auto& node_def = params->node_def;
4669 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"value", false}}));
4670 TF_RETURN_IF_ERROR(AllowDataTypes(*params, {
4671 DataType::DT_FLOAT,
4672 DataType::DT_HALF,
4673 DataType::DT_INT32,
4674 }));
4675 // Input must be rank 1 or higher, since we can't unpack on axis 0.
4676 if (inputs.at(0).GetTrtDims().nbDims == 0) {
4677 return errors::Unimplemented(
4678 "Input \"value\" for Unpack must be rank 2 or greater, at ",
4679 node_def.name());
4680 }
4681 TFAttrs attrs(node_def);
4682 const int tf_axis = attrs.get<int64>("axis");
4683 const int num = attrs.get<int64>("num");
4684
4685 return ConvertSplitHelper(params, inputs.at(0), tf_axis, num, true);
4686 }
4687
4688 // Supports cast fp16=>fp32 through IIdentityLayer.
4689 Status ConvertCast(OpConverterParams* params) {
4690 const NodeDef& node_def = params->node_def;
4691 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}}));
4692 auto unsupport_cast_error = [&]() {
4693 return errors::Unimplemented("Cast op: ", node_def.op(),
4694 " not supported at: ", node_def.name());
4695 };
4696
4697 DataType input_type;
4698 TF_RETURN_IF_ERROR(GetInputTfType(*params, &input_type, 0));
4699 if (input_type != DataType::DT_HALF) {
4700 return unsupport_cast_error();
4701 }
4702
4703 DataType output_type;
4704 TF_RETURN_IF_ERROR(GetNodeDefTfType(params->node_def, &output_type,
4705 kCastOutputTypeAttrName));
4706
4707 if (output_type != DataType::DT_FLOAT) {
4708 return unsupport_cast_error();
4709 }
4710
4711 if (params->validation_only) return Status::OK();
4712
4713 ITensorProxyPtr input = params->inputs.at(0).tensor();
4714 nvinfer1::IIdentityLayer* layer =
4715 params->converter->network()->addIdentity(*input->trt_tensor());
4716 params->converter->SetLayerName(layer, node_def);
4717 layer->setPrecision(nvinfer1::DataType::kFLOAT);
4718
4719 if (layer->getOutput(0)->getType() != nvinfer1::DataType::kFLOAT) {
4720 return errors::Internal("IIdentityLayer doesn't work as expected");
4721 }
4722
4723 params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
4724 return Status::OK();
4725 }
4726
4727 Status ConvertConcat(OpConverterParams* params) {
4728 const auto& inputs = params->inputs;
4729 const auto& node_def = params->node_def;
4730 TFAttrs attrs(node_def);
4731 // Get number of tensor inputs.
4732 const int num_inputs = attrs.get<int64>("N");
4733 if (num_inputs != static_cast<int>(inputs.size()) - 1) {
4734 return errors::InvalidArgument(
4735 "Number of inputs for ConcatV2 is inconsistent with N attribute, at ",
4736 node_def.name());
4737 }
4738 // Validate inputs. Values must be tensors for now, although it would be
4739 // possible to accept weights in explicit batch mode. See CheckInputsWeights
4740 // for details. TODO(tfeher): Allow weight input in explicit batch mode.
4741 std::vector<std::pair<string, TrtInputArg>> inputs_kinds;
4742 TrtInputArg expected_input = TrtInputArg::kTensor;
4743 inputs_kinds.reserve(num_inputs);
4744 for (int i = 0; i < num_inputs; ++i) {
4745 inputs_kinds.push_back({StrCat("values_", i), expected_input});
4746 }
4747 inputs_kinds.push_back({"axis", TrtInputArg::kWeight});
4748 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, inputs_kinds));
4749
4750 std::set<DataType> allowed_types{DataType::DT_FLOAT, DataType::DT_HALF,
4751 DataType::DT_INT32};
4752
4753 TF_RETURN_IF_ERROR(AllowDataTypes(*params, allowed_types));
4754 const auto axis = inputs.at(num_inputs).weights().GetSpan<int>();
4755 if (axis.size() != 1) {
4756 return errors::InvalidArgument("Axis for ConcatV2 must be a scalar, at ",
4757 node_def.name());
4758 }
4759 int trt_axis = 0;
4760 const auto dim = inputs.at(0).GetTrtDims();
4761 TF_RETURN_IF_ERROR(ConvertAxis(axis[0], dim.nbDims, node_def.name(),
4762 params->use_implicit_batch, &trt_axis));
4763 // Check that dimensions match on non-concatenate axis.
4764 TF_RETURN_IF_ERROR(VerifyShapesMatch(
4765 absl::Span<const TRT_TensorOrWeights>(inputs).first(num_inputs), trt_axis,
4766 node_def.name()));
4767 if (params->validation_only) return Status::OK();
4768
4769 // Gather inputs as tensors
4770 std::vector<ITensorProxyPtr> input_tensors;
4771 input_tensors.reserve(num_inputs);
4772 for (int i = 0; i < num_inputs; i++) {
4773 input_tensors.push_back(inputs.at(i).tensor());
4774 }
4775 std::vector<nvinfer1::ITensor*> trt_input_tensors;
4776 for (const auto& t : input_tensors) {
4777 trt_input_tensors.push_back(t->trt_tensor());
4778 }
4779 nvinfer1::IConcatenationLayer* layer =
4780 params->converter->network()->addConcatenation(
4781 static_cast<nvinfer1::ITensor* const*>(trt_input_tensors.data()),
4782 input_tensors.size());
4783 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
4784 params->converter->SetLayerName(layer, node_def);
4785 layer->setAxis(trt_axis);
4786 params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
4787 return Status::OK();
4788 }
4789
4790 Status ConvertFusedBatchNorm(OpConverterParams* params) {
4791 const auto& inputs = params->inputs;
4792 const auto& node_def = params->node_def;
4793 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false},
4794 {"scale", true},
4795 {"offset", true},
4796 {"mean", true},
4797 {"variance", true}}));
4798 TF_RETURN_IF_ERROR(
4799 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
4800 TFAttrs attrs(node_def);
4801 float epsilon = attrs.get<float>("epsilon");
4802 auto data_format = attrs.get<string>("data_format");
4803 if (data_format != "NCHW") {
4804 return errors::Unimplemented(
4805 node_def.op(), " only supports data_format=NCHW, at ", node_def.name());
4806 }
4807 bool is_training = attrs.get<bool>("is_training");
4808 if (is_training) {
4809 // Trying to use batchnorm in training mode is a very common problem.
4810 // Because the error message will only be printed in VLOG(1) by the
4811 // segmenter, we issue a special warning so that users will actually see it.
4812 LOG_WARNING_WITH_PREFIX
4813 << node_def.op() << " only supports is_training=false. If you "
4814 << "are using Keras, please call "
4815 << "keras.backend.set_learning_phase(0) before constructing "
4816 << "your model. At " << node_def.name();
4817 return errors::Unimplemented(node_def.op(),
4818 " only supports is_training=false, at ",
4819 node_def.name());
4820 }
4821 ITensorProxyPtr tensor = inputs.at(0).tensor();
4822 if (!params->use_implicit_batch && tensor->getDimensions().d[1] == -1) {
4823 // This check is to make sure that channel dimension is known during
4824 // conversion.
4825 //
4826 // We check this only in explicit batch mode and reject an op with unknown
4827 // channel dimension during segmentation. In implicit batch mode we have
4828 // known shapes during conversion even though the shapes may not be known
4829 // during segmentation (see the actual argument for input_shapes when
4830 // ConvertGraphDefToEngine is called from TRTEngineOp::BuildEngine).
4831 return errors::InvalidArgument("Channel dimension must be static, at ",
4832 node_def.name());
4833 }
4834 // Check parameter types
4835 auto parameter_type = inputs.at(1).weights().TrtDType();
4836 if ((parameter_type != nvinfer1::DataType::kFLOAT) &&
4837 (parameter_type != nvinfer1::DataType::kHALF)) {
4838 return errors::Unimplemented(
4839 "Only float32 or float16 weight data type is supported, for node ",
4840 node_def.name(), " got ", DebugString(parameter_type));
4841 }
4842 for (int i = 1; i < 5; i++) {
4843 if (inputs.at(i).weights().TrtDType() != parameter_type) {
4844 return errors::Unimplemented(
4845 "Inconsistent parameter type for batchnorm is not supported, at: " +
4846 node_def.name());
4847 }
4848 }
4849
4850 TRT_ShapedWeights dummy_power_weights(parameter_type);
4851 size_t nweight = 0;
4852 for (int i = 1; i < 5; i++) {
4853 nweight = std::max<size_t>(nweight, inputs.at(i).weights().count());
4854 }
4855 const TRT_ShapedWeights* ptr_shape_weights = nullptr;
4856 for (int i = 1; i < 5; i++) {
4857 if (inputs.at(i).weights().count() == nweight) {
4858 ptr_shape_weights = &(inputs.at(i).weights());
4859 } else if (inputs.at(i).weights().count() != 1) {
4860 return errors::InvalidArgument(
4861 "Inconsistent batchnorm parameter count, at: " + node_def.name());
4862 }
4863 }
4864 if (params->validation_only) return Status::OK();
4865
4866   // We could technically have two weights with different shapes; that would
4867   // require two addScale ops, which is arguably less performant.
4868 TRT_ShapedWeights combined_scale_weights =
4869 params->weight_store->GetTempWeights(*ptr_shape_weights);
4870 TRT_ShapedWeights combined_offset_weights =
4871 params->weight_store->GetTempWeights(*ptr_shape_weights);
4872
4873 const Eigen::half* cast_vals_array[4];
4874 const float* vals_array[4];
4875 for (int j = 0; j < 4; j++) {
4876 cast_vals_array[j] =
4877 static_cast<Eigen::half const*>(inputs.at(j + 1).weights().GetValues());
4878 vals_array[j] =
4879 static_cast<float const*>(inputs.at(j + 1).weights().GetValues());
4880 }
4881 Eigen::half* cast_combined_scale_vals =
4882 static_cast<Eigen::half*>(combined_scale_weights.GetValues());
4883 Eigen::half* cast_combined_offset_vals =
4884 static_cast<Eigen::half*>(combined_offset_weights.GetValues());
4885 float* combined_scale_vals =
4886 static_cast<float*>(combined_scale_weights.GetValues());
4887 float* combined_offset_vals =
4888 static_cast<float*>(combined_offset_weights.GetValues());
4889
4890 for (size_t i = 0; i < nweight; ++i) {
4891 float batchnorm_data[4];
4892 for (int j = 0; j < 4; j++) {
4893 if (inputs.at(j + 1).weights().count() != 1) {
4894 if (parameter_type == nvinfer1::DataType::kFLOAT) {
4895 batchnorm_data[j] = vals_array[j][i];
4896 } else if (parameter_type == nvinfer1::DataType::kHALF) {
4897 batchnorm_data[j] = static_cast<float>(cast_vals_array[j][i]);
4898 }
4899 } else {
4900 if (parameter_type == nvinfer1::DataType::kFLOAT) {
4901 batchnorm_data[j] = vals_array[j][0];
4902 } else if (parameter_type == nvinfer1::DataType::kHALF) {
4903 batchnorm_data[j] = static_cast<float>(cast_vals_array[j][0]);
4904 }
4905 }
4906 }
4907 float scale = batchnorm_data[0];
4908 float offset = batchnorm_data[1];
4909 float mean = batchnorm_data[2];
4910 float variance = batchnorm_data[3];
4911 float combined_scale_val = scale / sqrtf(variance + epsilon);
4912 float combined_offset_val = offset - mean * combined_scale_val;
4913 if (parameter_type == nvinfer1::DataType::kFLOAT) {
4914 combined_scale_vals[i] = combined_scale_val;
4915 combined_offset_vals[i] = combined_offset_val;
4916 } else if (parameter_type == nvinfer1::DataType::kHALF) {
4917 cast_combined_scale_vals[i] = Eigen::half(combined_scale_val);
4918 cast_combined_offset_vals[i] = Eigen::half(combined_offset_val);
4919 }
4920 }
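  // Worked example (illustrative, assumed values): with scale = 1.0,
  // offset = 0.0, mean = 0.5, variance = 0.24 and epsilon = 0.01,
  // combined_scale_val = 1 / sqrt(0.25) = 2.0 and
  // combined_offset_val = 0.0 - 0.5 * 2.0 = -1.0, so the IScaleLayer below
  // computes y = 2.0 * x - 1.0, i.e. scale * (x - mean) / sqrt(var + eps) +
  // offset folded into a single per-channel scale and shift.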
4921
4922 nvinfer1::ScaleMode mode = nweight == 1 ? nvinfer1::ScaleMode::kUNIFORM
4923 : nvinfer1::ScaleMode::kCHANNEL;
4924 nvinfer1::IScaleLayer* layer = params->converter->network()->addScale(
4925 *tensor->trt_tensor(), mode, combined_offset_weights.GetTrtWeights(),
4926 combined_scale_weights.GetTrtWeights(),
4927 dummy_power_weights.GetTrtWeights());
4928 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
4929 params->converter->SetLayerName(layer, node_def);
4930 ITensorProxyPtr output_tensor = layer->getOutput(0);
4931 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
4932 return Status::OK();
4933 }
4934
4935 Status ConvertGather(OpConverterParams* params) {
4936 const auto& inputs = params->inputs;
4937 const auto& node_def = params->node_def;
4938 // TODO(tmorris): Use CheckInputsWeights by changing bool to enum with an
4939 // option for an input to be either tensor or weight.
4940 TF_RETURN_IF_ERROR(
4941 CheckInputsWeights(*params, {{"params", TrtInputArg::kBoth},
4942 {"indices", TrtInputArg::kTensor},
4943 {"axis", TrtInputArg::kWeight}}));
4944
4945 const auto& params_input = inputs.at(0);
4946 const auto& indices_input = inputs.at(1);
4947 const auto& axis_input = inputs.at(2);
4948
4949 TF_RETURN_IF_ERROR(AllowDataTypes(
4950 *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32},
4951 /*dtype_attr_name=*/"Tparams"));
4952 TF_RETURN_IF_ERROR(AllowDataTypes(*params, {DataType::DT_INT32},
4953 /*dtype_attr_name=*/"Tindices"));
4954
4955 absl::Span<const int> axis = axis_input.weights().GetSpan<int>();
4956 if (axis.size() != 1) {
4957 return errors::InvalidArgument("Axis for GatherV2 must be a scalar, at ",
4958 node_def.name());
4959 }
4960 int trt_axis = 0;
4961 TF_RETURN_IF_ERROR(ConvertAxis(
4962 axis[0], params_input.GetTrtDims().nbDims, node_def.name(),
4963 params->use_implicit_batch && params_input.is_tensor(), &trt_axis));
4964 if (params->use_implicit_batch && params_input.is_weights() &&
4965 trt_axis != 0) {
4966 return errors::Unimplemented(
4967 "The input axis must be zero when params is a weight.");
4968 }
4969 if (params->use_implicit_batch && params_input.is_tensor() &&
4970 indices_input.batch_size() != 1) {
4971 return errors::Unimplemented(
4972 "Indices must have a batch size of 1 when params is a tensor.");
4973 }
4974   // When both inputs are tensors, the TF gather result will have rank:
4975 // (params.nbDims + 1) + (indices.nbDims + 1) - 1,
4976 // where "+ 1" adds the batch dim. If params is a weight, the TRT rank matches
4977 // the TF rank so we don't have to add + 1.
4978 const int params_tf_rank =
4979 params_input.GetTrtDims().nbDims +
4980 (params->use_implicit_batch && params_input.is_tensor() ? 1 : 0);
4981 const int indices_tf_rank =
4982 indices_input.GetTrtDims().nbDims + (params->use_implicit_batch ? 1 : 0);
4983 const int tf_gather_output_rank = params_tf_rank + indices_tf_rank - 1;
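  // Worked example (illustrative): in implicit batch mode with a params tensor
  // of TF rank 3 (TRT dims rank 2) and indices of TF rank 2 (TRT dims rank 1),
  // tf_gather_output_rank = 3 + 2 - 1 = 4, which is within the limit checked
  // below. The reshape near the end of this function re-inserts a singleton
  // dim at trt_axis so the result matches the TF GatherV2 output shape.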
4984 if (tf_gather_output_rank >
4985 nvinfer1::Dims::MAX_DIMS + (params->use_implicit_batch ? 1 : 0)) {
4986 return errors::InvalidArgument(
4987 "Result of gather has dimension greater than ",
4988 nvinfer1::Dims::MAX_DIMS + 1);
4989 }
4990 if (params->validation_only) return Status::OK();
4991
4992   // Convert params to a tensor if it is a weight.
4993 ITensorProxyPtr params_tensor = nullptr;
4994 if (params_input.is_weights()) {
4995 params_tensor = params->converter->CreateConstantLayer(
4996 params_input.weights(), params_input.GetTrtDims());
4997 } else {
4998 params_tensor = params_input.tensor();
4999 }
5000
5001 // Note on how IGatherLayer works: if both the data and indices tensors have
5002 // a batch size dimension of size N, it performs:
5003 // for batchid in xrange(N):
5004 // output[batchid, a0, ..., an, i, ..., j, b0, ..., bn] = (
5005 // data[batchid, a0, ..., an, indices[batchid, i, ..., j] b0, ..., bn])
5006 nvinfer1::IGatherLayer* layer = params->converter->network()->addGather(
5007 *params_tensor->trt_tensor(), *indices_input.tensor()->trt_tensor(),
5008 trt_axis);
5009 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
5010 params->converter->SetLayerName(layer, node_def);
5011
5012 ITensorProxyPtr output_tensor = layer->getOutput(0);
5013 nvinfer1::Dims trt_gather_output_dims = output_tensor->getDimensions();
5014 // Note for the "- 2": one is for the output batch dim encapsulated by TF-TRT,
5015 // and the other is for the output dimension that is squeezed by IGatherLayer
5016 // because of the implicit batch dim in the indices (see the above note).
5017 const int expected_trt_output_rank =
5018 tf_gather_output_rank - (params_input.is_tensor() ? 2 : 1);
5019 if (params->use_implicit_batch &&
5020 trt_gather_output_dims.nbDims != expected_trt_output_rank) {
5021 return errors::Internal(
5022 "Get unexpected output dimensions of IGatherLayer. Expect nbDims: ",
5023 expected_trt_output_rank,
5024 ", actual nbDims: ", trt_gather_output_dims.nbDims);
5025 }
5026 // Reshape the output so after adding the implicit batch dim it'll match the
5027 // output shape of TF GatherV2.
5028 if (params->use_implicit_batch && params_input.is_tensor()) {
5029 for (int i = trt_gather_output_dims.nbDims; i > trt_axis; --i) {
5030 trt_gather_output_dims.d[i] = trt_gather_output_dims.d[i - 1];
5031 }
5032 trt_gather_output_dims.d[trt_axis] = 1;
5033 ++trt_gather_output_dims.nbDims;
5034
5035 TF_RETURN_IF_ERROR(PrepareTensorForShape(
5036 params->converter, TRT_TensorOrWeights(output_tensor),
5037 trt_gather_output_dims,
5038 /*validation_only=*/false, &output_tensor, node_def));
5039 }
5040
5041 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
5042 return Status::OK();
5043 }
5044
5045 // Converts the input matrix multiplication node to a fully connected (FC) layer
5046 // if possible, as the FC layer has more tactics and INT8 implementations.
5047 // Returns the output ITensor* if the node is converted or nullptr if conversion
5048 // is not possible. An error status indicates internal problems during
5049 // conversion.
5050 StatusOr<ITensorProxyPtr> ConvertFullyConnectedImpl(OpConverterParams* params,
5051 TRT_TensorOrWeights input_a,
5052 TRT_TensorOrWeights input_b,
5053 bool transpose_a,
5054 bool transpose_b) {
5055 if (!(!transpose_a && input_a.is_tensor() && input_b.is_weights())) {
5056 VLOG(2) << "Not FC compatible, A must be non transposed tensor, and B "
5057 "must be constant.";
5058 return ITensorProxyPtr(nullptr);
5059 }
5060
5061 if (!params->use_implicit_batch && input_b.GetTrtDims().nbDims > 2 &&
5062 input_b.GetTrtDims().d[0] != 1) {
5063 // Implicit broadcasting, if needed, has already been considered to
5064 // transform the inputs and ensure the two operands have the same rank here.
5065 // If the inputs have rank >= 3, then d[0] is the explicit batch dimension.
5066     // In explicit batch mode the weight (input_b) must therefore have batch size 1.
5067 VLOG(2) << "Not FC compatible, if B has an explicit batch dimension, then "
5068 "it must be 1.";
5069 return ITensorProxyPtr(nullptr);
5070 }
5071
5072 nvinfer1::Dims input_dim = input_a.GetTrtDims();
5073 if (input_dim.d[input_dim.nbDims - 1] == -1) {
5074 VLOG(2) << "Not FC compatible, last dim of A must be static.";
5075 return ITensorProxyPtr(nullptr);
5076 }
5077
5078 if (input_dim.nbDims + 2 > nvinfer1::Dims::MAX_DIMS) {
5079 VLOG(2) << "Not FC compatible, cannot expand A's shape.";
5080 return ITensorProxyPtr(nullptr);
5081 }
5082
5083 // Add two trailing 1's because FC layer combines the last three dims.
5084 ITensorProxyPtr tensor_a = nullptr;
5085 nvinfer1::Dims reshape_dim{input_dim.nbDims + 2, {}};
5086   // The empty braces initialize the elements of reshape_dim.d to 0. A value 0 in
5087 // reshape_dim.d[i] will preserve the i-th dimension value from the shape of
5088 // input_a.
5089 reshape_dim.d[input_dim.nbDims] = 1;
5090 reshape_dim.d[input_dim.nbDims + 1] = 1;
5091 const NodeDef& node_def = params->node_def;
5092 TF_RETURN_IF_ERROR(PrepareTensorForShape(
5093 params->converter, input_a, reshape_dim,
5094 /*validation_only=*/false, &tensor_a, node_def, /*op_instance=*/0,
5095 /*origin_node_name=*/"FULLY_CONNECTED"));
5096
5097 VLOG(2) << "New shape of A " << DebugString(tensor_a->getDimensions());
5098
5099 TRT_ShapedWeights weights_b = input_b.weights();
5100 TRT_ShapedWeights weights_2D(weights_b);
5101 if (weights_b.shape_.nbDims > 2) {
5102 // Combine first nbDims-1 dims into a single dim, e.g. for a 4D tensor we
5103 // transform [N, H, W, C] -> [N*H*W, C]. This is only valid if all batch
5104 // dimensions are 1.
5105 if (std::any_of(weights_b.shape_.d,
5106 weights_b.shape_.d + weights_b.shape_.nbDims - 2,
5107 [](int d) { return d != 1; })) {
5108 VLOG(2) << "Not FC compatible, B has a batch dim larger than 1";
5109 return ITensorProxyPtr(nullptr);
5110 }
5111 int k = weights_b.shape_.d[weights_b.shape_.nbDims - 1];
5112 nvinfer1::Dims dims{2, {static_cast<int>(weights_b.count() / k), k}};
5113 TF_RETURN_IF_ERROR(weights_2D.SetShape(dims));
5114 }
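  // Worked example (illustrative, assumed shapes): a weight of shape
  // [1, 1, 256, 128] has k = 128 and count() = 32768, so weights_2D is
  // reshaped to [256, 128]; a leading dimension larger than 1, e.g.
  // [2, 256, 128], is rejected above as not FC compatible.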
5115
5116 // FC layer will transpose weights, so we need to pre-transpose.
5117 TRT_ShapedWeights weights(weights_2D.TrtDType());
5118 if (!transpose_b) {
5119 weights = params->weight_store->GetTempWeights(weights_2D);
5120 ReorderCKtoKC(weights_2D, &weights);
5121 } else {
5122 weights = weights_2D;
5123 }
5124 TRT_ShapedWeights biases(weights.TrtDType());
5125 int k = weights.shape_.d[weights.shape_.nbDims - 1];
5126 const int noutput = weights.count() / k;
5127 VLOG(2) << "Using fully connected layer with k=" << k
5128 << ", n_output=" << noutput
5129 << " weights shape: " << DebugString(weights.shape_) << " to convert "
5130 << node_def.op();
5131 nvinfer1::IFullyConnectedLayer* layer =
5132 params->converter->network()->addFullyConnected(
5133 *tensor_a->trt_tensor(), noutput, weights.GetTrtWeights(),
5134 biases.GetTrtWeights());
5135
5136 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
5137 params->converter->SetLayerName(layer, node_def);
5138 ITensorProxyPtr output_tensor = layer->getOutput(0);
5139
5140 // A fully connected layer produces output with two trailing singleton
5141 // dimensions. We remove these.
5142 auto output_dim = output_tensor->getDimensions();
5143 output_dim.nbDims -= 2;
5144 // A zero in output_dim indicates copying the corresponding input dimension
5145 // value during reshape.
5146 std::fill(output_dim.d, output_dim.d + output_dim.nbDims, 0);
5147 TF_RETURN_IF_ERROR(PrepareTensorForShape(
5148 params->converter, TRT_TensorOrWeights(output_tensor), output_dim,
5149 /*validation_only=*/false, &output_tensor, node_def,
5150 /*op_instance=*/1, /*origin_node_name=*/"FULLY_CONNECTED"));
5151 return output_tensor;
5152 }
5153
5154 StatusOr<ITensorProxyPtr> ConvertMatMulImpl(OpConverterParams* params,
5155 TRT_TensorOrWeights input_a,
5156 TRT_TensorOrWeights input_b,
5157 bool transpose_a,
5158 bool transpose_b) {
5159 if (params->use_implicit_batch) {
5160     // In implicit batch mode we are very limited in when we can multiply 2D
5161 // matrices. If input_A is a 2D tensor, then nbDims==1 (implicit batch dim
5162 // not counted). If A is not transposed and B is weight, then we can convert
5163 // this treating A as a batch of vectors. This is the only possibility
5164 // to implement MatMul with 2D input in implicit batch mode.
5165 if ((input_a.GetTrtDims().nbDims < 2 &&
5166 (transpose_a || !input_b.is_weights())) ||
5167 (input_b.GetTrtDims().nbDims < 2)) {
5168 return errors::InvalidArgument(
5169 "MatMul with 2D tensors requires explicit batch mode, or that tensor"
5170 " A is not transposed and B is a constant tensor.");
5171 }
5172 }
5173
5174 if (params->validation_only) return ITensorProxyPtr(nullptr);
5175
5176 StatusOr<ITensorProxyPtr> result = ConvertFullyConnectedImpl(
5177 params, input_a, input_b, transpose_a, transpose_b);
5178 TF_RETURN_IF_ERROR(result.status());
5179 ITensorProxyPtr output = result.ValueOrDie();
5180 if (*output) {
5181 // FC conversion was successful, we can return.
5182 return output;
5183 }
5184 const auto convert_to_itensor =
5185       [&params](TRT_TensorOrWeights operand) -> ITensorProxyPtr {
5186 if (operand.is_tensor()) {
5187 return operand.tensor();
5188 } else {
5189 return params->converter->CreateConstantLayer(operand.weights(),
5190 operand.GetTrtDims());
5191 }
5192 };
5193
5194 ITensorProxyPtr tensor_a = convert_to_itensor(input_a);
5195 ITensorProxyPtr tensor_b = convert_to_itensor(input_b);
5196
5197 const auto get_matrix_op = [](ITensorProxyPtr in,
5198 bool transpose) -> nvinfer1::MatrixOperation {
5199 return (transpose) ? nvinfer1::MatrixOperation::kTRANSPOSE
5200 : nvinfer1::MatrixOperation::kNONE;
5201 };
5202 nvinfer1::MatrixOperation op_a, op_b;
5203 // Note: In implicit batch mode kTRANSPOSE and kNONE are only valid if the
5204   // matrix has at least 2 non-batch dimensions. In implicit batch mode, if A has
5205 // 1 dim (excluding batch dim), then we can only use kVECTOR, which will treat
5206 // matrix A as a batch of vectors.
5207 op_a = (tensor_a->getDimensions().nbDims < 2)
5208 ? nvinfer1::MatrixOperation::kVECTOR
5209 : get_matrix_op(tensor_a, transpose_a);
  // In implicit batch mode, if B has only 1 dim (excluding the batch dim), we
  // have already rejected the case and do not convert. One could consider using
  // the kVECTOR flag to express C = MatMul(A, B.T) if A is a weight, but the result
5213 // will not have the correct shape: in TRT's implicit batch implementation,
5214 // the result is a batch of vectors D_ji = A_ik * B_jk, where j is the batch
5215 // dimension. In contrast, the TF MatMul op produces C = D.T, and we cannot
5216 // transpose over the batch dimension (implicit batch mode).
5217 op_b = get_matrix_op(tensor_b, transpose_b);
5218
5219 nvinfer1::IMatrixMultiplyLayer* layer =
5220 params->converter->network()->addMatrixMultiply(
5221 *tensor_a->trt_tensor(), op_a, *tensor_b->trt_tensor(), op_b);
5222
5223 const auto& node_def = params->node_def;
5224 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
5225 params->converter->SetLayerName(layer, node_def);
5226 return ITensorProxyPtr(layer->getOutput(0));
5227 }
5228
Status ConvertMatMulHelper(OpConverterParams* params,
5230 TRT_TensorOrWeights input_a,
5231 TRT_TensorOrWeights input_b, bool transpose_a,
5232 bool transpose_b) {
5233 StatusOr<ITensorProxyPtr> result =
5234 ConvertMatMulImpl(params, input_a, input_b, transpose_a, transpose_b);
5235 TF_RETURN_IF_ERROR(result.status());
5236 if (!params->validation_only) {
5237 params->outputs->push_back(TRT_TensorOrWeights(result.ValueOrDie()));
5238 }
5239 return Status::OK();
5240 }
5241
5242 // inputs are both two dimensional (ops::MatMul)
Status ConvertMatMul(OpConverterParams* params) {
5244 const auto& inputs = params->inputs;
5245 const auto& node_def = params->node_def;
5246 if (inputs.size() != 2) {
5247 return errors::InvalidArgument(node_def.op(), " got ", inputs.size(),
5248 " inputs but expected 2, at ",
5249 node_def.name());
5250 }
5251 TF_RETURN_IF_ERROR(
5252 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
5253
5254 TFAttrs attrs(node_def);
5255 bool transpose_a = attrs.get<bool>("transpose_a");
5256 bool transpose_b = attrs.get<bool>("transpose_b");
5257
5258 return ConvertMatMulHelper(params, inputs.at(0), inputs.at(1), transpose_a,
5259 transpose_b);
5260 }
5261
Status ConvertBatchMatMul(OpConverterParams* params) {
5263 const auto& inputs = params->inputs;
5264 const auto& node_def = params->node_def;
5265 if (inputs.size() != 2) {
5266 return errors::InvalidArgument(node_def.op(), " got ", inputs.size(),
5267 " inputs but expected 2, at ",
5268 node_def.name());
5269 }
5270 TF_RETURN_IF_ERROR(CheckInputsWeights(
5271 *params, {{"x", TrtInputArg::kBoth}, {"y", TrtInputArg::kBoth}}));
5272 // TODO(tfeher): Consider adding INT8 type because FC layer can support it.
5273 TF_RETURN_IF_ERROR(
5274 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
5275 if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) {
5276 return errors::InvalidArgument(
5277 "All inputs are weights, but Grappler is expected to fold them.");
5278 }
5279
5280 TFAttrs attrs(node_def);
5281 const bool transpose_a = attrs.get<bool>("adj_x");
5282 const bool transpose_b = attrs.get<bool>("adj_y");
5283
5284 // In case input_l is weight, check whether input_l has implicit batch mode
5285 // compatible batch dim.
5286 const auto check_weight_is_not_batched =
5287 [](const TRT_TensorOrWeights& input_l,
5288 const TRT_TensorOrWeights& input_r) {
5289 // There is no way to batch constants in TRT using implicit batch mode.
5290 // Example:
5291 // Tensor with TF Dims: 12 5 3 -> TRT Dims: 5 3
5292 // Weight with TF Dims: 12 3 6 -> TRT Dims: 12 3 6
5293 // It is not possible to treat the weight input as a batched [3, 6]
5294 // tensor. Batched weight tensors must have batch dim = 1 (after the
5295 // broadcast).
5296 if (input_l.is_weights() &&
5297 input_l.GetTrtDims().nbDims > input_r.GetTrtDims().nbDims &&
5298 input_l.GetTrtDims().d[0] != 1) {
5299 return errors::Unimplemented(
5300 "TensorRT does not support batched constants in implicit batch "
5301 "mode.");
5302 }
5303 return Status::OK();
5304 };
5305 if (params->use_implicit_batch) {
5306 TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(0), inputs.at(1)));
5307 TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(1), inputs.at(0)));
5308 }
5309
5310 // Broadcast inputs. We don't check feasibility since the dimensions in a
5311 // MatMul don't need to match. For example, consider a valid set of inputs
5312 // which would produce an output of shape [N, T, K]:
5313 // input 0: [N, T, C]
5314 // input 1: [1, C, K]
  //   Since C != K and T != C, the feasibility check would fail.
5316 auto input_l = std::make_unique<TRT_TensorOrWeights>(inputs.at(0));
5317 auto input_r = std::make_unique<TRT_TensorOrWeights>(inputs.at(1));
5318 TF_RETURN_IF_ERROR(BroadcastTensors(input_l, input_r,
5319 /*check_feasibility=*/false, params));
5320
5321 if (params->validation_only) return Status::OK();
5322
5323 return ConvertMatMulHelper(params, *input_l, *input_r, transpose_a,
5324 transpose_b);
5325 }
5326
5327 // Finds the indices of elements in [begin, end) in array
5328 // [array_begin, array_end), and appends the indices to permute. This is used to
5329 // construct the permutation sequence for the operand with input labels
5330 // [array_begin, array_end) to the desired permuted labels [begin, end).
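// For example, with desired labels "bca" and array labels "abcd", the indices
// {1, 2, 0} are appended to permute.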
5331 template <typename Iterator>
Status FindIndices(Iterator begin, Iterator end, Iterator array_begin,
5333 Iterator array_end, std::vector<int>* permute) {
5334 const int n = array_end - array_begin;
5335 if (n < end - begin) {
5336 return errors::Internal("Incorrect array size");
5337 }
5338 for (auto i = begin; i < end; i++) {
5339 int idx = std::find(array_begin, array_end, *i) - array_begin;
5340 if (idx >= n) {
5341 return errors::Internal("Label not found");
5342 }
5343 permute->push_back(idx);
5344 }
5345 return Status::OK();
5346 }
5347
5348 #if IS_TRT_VERSION_GE(7, 1, 3, 0)
5349 // Layout of the einsum dimensions: Batch, Free and Contraction indices.
5350 // Example: abcd,adef -> abde. The first tensor has layout BFC, the second BCF.
5351 enum class EinsumLayout { BFC, BCF, MIX };
5352
5353 // Describes an operand: input shape, number of batch, free and contract
5354 // dimensions, and the permutation that is needed to bring it to a matmul
5355 // compatible form.
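// For example, for the equation "ab,bc->ac" the first operand has labels
// [a, b] with types [Free, Contract], giving b=0, f=1, c=1, layout BFC,
// offset_f=0, offset_c=1, and an empty permutation.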
5356 struct EinsumDescriptor {
  EinsumDescriptor() : b(0), f(0), c(0) {}
5358
5359 // Deduces the number of batch, free, contract dimensions from the input
5360 // labels, decides what layout to use, and determines permutation indices for
5361 // that layout.
  Status InitDescriptor(const TRT_TensorOrWeights& operand, Labels input_labels,
5363 std::vector<EinsumHelper::DimensionType>& label_types,
5364 EinsumLayout preferred_layout,
5365 EinsumDescriptor* other = nullptr) {
5366 if (preferred_layout == EinsumLayout::MIX)
5367 return errors::Internal("Preferred einsum layout cannot be MIX");
5368 const EinsumHelper::DimensionType kBatch =
5369 EinsumHelper::DimensionType::kBatch;
5370 const EinsumHelper::DimensionType kFree =
5371 EinsumHelper::DimensionType::kFree;
5372 const EinsumHelper::DimensionType kContract =
5373 EinsumHelper::DimensionType::kContract;
5374
5375 // Map label indices to label types.
5376 std::vector<EinsumHelper::DimensionType> types; // Input label types.
5377 std::transform(input_labels.begin(), input_labels.end(),
5378 std::back_inserter(types),
5379 [&label_types, kBatch](int i) { return label_types.at(i); });
5380
5381 using label_t_iterator = std::vector<EinsumHelper::DimensionType>::iterator;
5382 auto count_labels = [](label_t_iterator begin, label_t_iterator end,
5383 EinsumHelper::DimensionType val) {
5384 return std::count_if(begin, end, [val](EinsumHelper::DimensionType t) {
5385 return t == val;
5386 });
5387 };
5388
5389 b = count_labels(types.begin(), types.end(), kBatch);
5390 f = count_labels(types.begin(), types.end(), kFree);
5391 c = count_labels(types.begin(), types.end(), kContract);
5392
5393 if (c == 0 || f == 0) {
5394 VLOG(2) << "Einsum equation needs to have at least one free and one "
5395 "contract dimension";
5396 return errors::Unimplemented("No conversion for einsum equation.");
5397 }
5398
5399 // Checks whether input_labels[offset:offset+m] matches labels from other.
5400 auto order_matches = [other, &input_labels, kBatch, kFree, kContract](
5401 int offset, int m,
5402 EinsumHelper::DimensionType dim_type) {
5403 if (!other) return true;
5404 int offset_other = 0;
5405 if (dim_type == kFree)
5406 offset = other->offset_f;
5407 else if (dim_type == kContract)
5408 offset = other->offset_c;
5409 return std::equal(input_labels.begin() + offset,
5410 input_labels.begin() + offset + m,
5411 other->permuted_labels.begin() + offset_other);
5412 };
5413
5414 // Check if the current layout is BFC or BCF. In that case we could avoid
5415 // transpose.
5416 layout = EinsumLayout::MIX;
5417 if (count_labels(types.begin(), types.begin() + b, kBatch) == b &&
5418 order_matches(0, b, kBatch)) {
5419 // Batch dims are the leading dims. They have the same order as other.
5420 if (count_labels(types.begin() + b, types.begin() + b + f, kFree) == f) {
5421 // All the free dims are placed consecutively after the batch dims.
5422 // Their order is arbitrary. The final transpose will ensure that the
5423 // output has correct order. We still have to check that the contract
5424 // indices have correct order.
5425 if (order_matches(b + f, c, kContract)) {
5426 layout = EinsumLayout::BFC;
5427 }
5428 } else if (count_labels(types.begin() + b, types.begin() + b + c,
5429 kContract) == c) {
5430 // All the contract dims are placed consecutively after the batch
5431 // dims. Check whether the contract dims have the same order as the
5432 // contract dims in other.
5433 if (order_matches(b, c, kContract)) {
5434 layout = EinsumLayout::BCF;
5435 }
5436 }
5437 }
5438
5439 if (layout == EinsumLayout::MIX) {
5440 // Input label types are mixed. Calculate a permutation that maps them
5441 // to the preferred layout (BCF or BFC).
5442 layout = preferred_layout;
5443 if (!other) {
5444 AppendMatchingIndicesToPermute(types, kBatch);
5445 } else {
5446 TF_RETURN_IF_ERROR(
5447 FindIndices(other->permuted_labels.begin(),
5448 other->permuted_labels.begin() + other->b,
5449 input_labels.begin(), input_labels.end(), &permute));
5450 }
5451 if (layout == EinsumLayout::BFC) {
5452 AppendMatchingIndicesToPermute(types, kFree);
5453 if (!other) {
5454 AppendMatchingIndicesToPermute(types, kContract);
5455 } else {
5456 TF_RETURN_IF_ERROR(FindIndices(
5457 other->permuted_labels.begin() + other->offset_c,
5458 other->permuted_labels.begin() + other->offset_c + other->c,
5459 input_labels.begin(), input_labels.end(), &permute));
5460 }
5461 } else {
5462 if (!other) {
5463 AppendMatchingIndicesToPermute(types, kContract);
5464 } else {
5465 TF_RETURN_IF_ERROR(FindIndices(
5466 other->permuted_labels.begin() + other->offset_c,
5467 other->permuted_labels.begin() + other->offset_c + other->c,
5468 input_labels.begin(), input_labels.end(), &permute));
5469 }
5470 AppendMatchingIndicesToPermute(types, kFree);
5471 }
5472 }
5473
5474 if (layout == EinsumLayout::BFC) {
5475 offset_f = b;
5476 offset_c = f + b;
5477 } else {
5478 offset_f = b + c;
5479 offset_c = b;
5480 }
5481
5482 dims = operand.GetTrtDims();
5483 for (int i = 0; i < b; i++) {
5484 // Set unknown batch dims to zero. These dims will be used in reshape op,
5485 // where zero is a special value for retaining the original dim size.
5486 if (dims.d[i] == -1) dims.d[i] = 0;
5487 }
5488 permuted_labels = input_labels;
5489 if (!permute.empty()) {
5490 // Apply the permutation on the dimension array.
5491 nvinfer1::Dims orig_dims = dims;
5492 for (int i = 0; i < permute.size(); i++) {
5493 dims.d[i] = orig_dims.d[permute[i]];
5494 permuted_labels[i] = input_labels[permute[i]];
5495 }
5496 }
5497 size_tensors.resize(dims.nbDims, nullptr);
5498
5499 VLOG(2) << "Set up descriptor with "
5500 << (layout == EinsumLayout::BFC ? "BFC" : "BCF")
5501 << " layout, b=" << b << ", f=" << f << ", c=" << c;
5502 return Status::OK();
5503 }
5504
  // Appends to permute the indices at which types matches val.
  void AppendMatchingIndicesToPermute(
5507 const std::vector<EinsumHelper::DimensionType>& types,
5508 EinsumHelper::DimensionType val) {
5509 for (int i = 0; i < types.size(); i++) {
5510 if (types[i] == val) {
5511 permute.push_back(i);
5512 }
5513 }
5514 }
5515
5516 // Returns whether the free and contract dimension have static shape.
  bool HasStaticShape() {
5518 return !std::any_of(dims.d + b, dims.d + dims.nbDims,
5519 [](int k) { return k == -1; });
5520 }
5521
  nvinfer1::Permutation GetPermutation() {
5523 nvinfer1::Permutation p;
5524 std::copy(permute.begin(), permute.end(), p.order);
5525 return p;
5526 }
5527
  Status SetDynamicSize(OpConverterParams* params,
5529 const TRT_TensorOrWeights& operand) {
5530 if (operand.GetTrtDims().nbDims != dims.nbDims)
      return errors::Internal("Operand dims must agree with descriptor dims");
5532
5533 if (operand.is_weights()) {
5534 for (int i = 0; i < operand.GetTrtDims().nbDims; i++) {
5535 // dims.d stores the permuted dims.
5536 TF_RETURN_IF_ERROR(
5537 CreateScalarConstant(params, dims.d[i], &size_tensors[i]));
5538 }
5539 return Status::OK();
5540 }
5541 auto* shape_layer =
5542 params->converter->network()->addShape(*operand.tensor()->trt_tensor());
5543 TFTRT_RETURN_ERROR_IF_NULLPTR(shape_layer, params->node_def.name());
5544 ITensorProxyPtr shape = shape_layer->getOutput(0);
5545 for (int i = 0; i < operand.GetTrtDims().nbDims; i++) {
5546 int idx = permute.empty() ? i : permute.at(i);
5547 auto* layer = params->converter->network()->addSlice(
5548 *shape->trt_tensor(), {1, {idx}}, {1, {1}}, {1, {1}});
5549 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, params->node_def.name());
5550 size_tensors[i] = layer->getOutput(0);
5551 TFTRT_RETURN_ERROR_IF_NULLPTR(size_tensors[i], "error, slice is nullptr");
5552 }
5553 return Status::OK();
5554 }
5555
5556 EinsumLayout layout;
5557 int b; // number of batch dims
5558 int f; // number of free dims
  int c;  // number of contraction dims
5560 int offset_f;
5561 int offset_c;
5562 nvinfer1::Dims dims;
5563 std::vector<int> permute;
5564 std::vector<ITensorProxyPtr> size_tensors;
5565 Labels permuted_labels;
5566 };
5567
Status GetDimsProd(nvinfer1::Dims dims, int offset, int n, int32_t* out) {
5569 size_t prod = std::accumulate(dims.d + offset, dims.d + offset + n, size_t(1),
5570 std::multiplies<size_t>());
5571 if (prod > std::numeric_limits<int32_t>::max()) {
5572 return errors::Internal("Matrix too large");
5573 } else {
5574 *out = prod;
5575 }
5576 return Status::OK();
5577 }
5578
Status GetDimsProdDynamic(OpConverterParams* params,
5580 std::vector<ITensorProxyPtr>::const_iterator begin,
5581 std::vector<ITensorProxyPtr>::const_iterator end,
5582 ITensorProxyPtr* out) {
5583 *out = *begin;
5584 begin++;
5585 while (begin != end) {
5586 nvinfer1::IElementWiseLayer* layer =
5587 params->converter->network()->addElementWise(
5588 *(*out)->trt_tensor(), *(*begin)->trt_tensor(),
5589 nvinfer1::ElementWiseOperation::kPROD);
5590 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, params->node_def.name());
5591 *out = layer->getOutput(0);
5592 begin++;
5593 }
5594 return Status::OK();
5595 }
5596
Status ConcatenateShape(OpConverterParams* params,
5598 const std::vector<ITensorProxyPtr> size_tensors,
5599 ITensorProxyPtr* new_shape) {
5600 std::vector<nvinfer1::ITensor*> trt_size_tensors;
5601 for (const auto& t : size_tensors) {
5602 trt_size_tensors.push_back(t->trt_tensor());
5603 }
5604 nvinfer1::IConcatenationLayer* layer =
5605 params->converter->network()->addConcatenation(
5606 static_cast<nvinfer1::ITensor* const*>(trt_size_tensors.data()),
5607 size_tensors.size());
5608 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, params->node_def.name());
5609 layer->setAxis(0);
5610 *new_shape = layer->getOutput(0);
5611 return Status::OK();
5612 }
5613
5614 // Reshapes operand so that the free dimensions are combined into a single dim,
5615 // and the contract dimensions are combined into another single dim.
Status GetEinsumNewDynamicShape(OpConverterParams* params,
5617 const EinsumDescriptor& desc,
5618 ITensorProxyPtr* new_shape) {
5619 std::vector<ITensorProxyPtr> size(desc.size_tensors.begin(),
5620 desc.size_tensors.begin() + desc.b + 2);
5621
5622 int idx_f = desc.layout == EinsumLayout::BFC ? desc.b : desc.b + 1;
5623 int idx_c = desc.layout == EinsumLayout::BFC ? desc.b + 1 : desc.b;
5624
5625 TF_RETURN_IF_ERROR(GetDimsProdDynamic(
5626 params, desc.size_tensors.begin() + desc.offset_f,
5627 desc.size_tensors.begin() + desc.offset_f + desc.f, &size[idx_f]));
5628
5629 TF_RETURN_IF_ERROR(GetDimsProdDynamic(
5630 params, desc.size_tensors.begin() + desc.offset_c,
5631 desc.size_tensors.begin() + desc.offset_c + desc.c, &size[idx_c]));
5632
5633 TF_RETURN_IF_ERROR(ConcatenateShape(params, size, new_shape));
5634 return Status::OK();
5635 }
5636
5637 // Reshapes operand so that the free dimensions are combined into a single dim,
5638 // and the contract dimensions are combined into another single dim.
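// For example, a BFC operand with dims [2, 3, 4, 5] and b=1, f=2, c=1 is
// reshaped to [2, 12, 5].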
Status GetEinsumNewStaticShape(const EinsumDescriptor& desc,
5640 nvinfer1::Dims* new_dims) {
5641 new_dims->nbDims = desc.b + 2;
5642 // Copy batch dims.
5643 std::copy(desc.dims.d, desc.dims.d + desc.b, new_dims->d);
5644 // Combine free dims and contract dims.
5645 int idx_f = desc.layout == EinsumLayout::BFC ? desc.b : desc.b + 1;
5646 int idx_c = desc.layout == EinsumLayout::BFC ? desc.b + 1 : desc.b;
5647 TF_RETURN_IF_ERROR(
5648 GetDimsProd(desc.dims, desc.offset_f, desc.f, new_dims->d + idx_f));
5649 TF_RETURN_IF_ERROR(
5650 GetDimsProd(desc.dims, desc.offset_c, desc.c, new_dims->d + idx_c));
5651 return Status::OK();
5652 }
5653
5654 // Adds shuffle layer (if needed) to bring einsum operand to a matmul compatible
5655 // format.
Status ShuffleEinsumTensor(OpConverterParams* params,
5657 std::unique_ptr<TRT_TensorOrWeights>* operand,
5658 EinsumDescriptor* desc, int op_instance) {
5659 if (params->validation_only) return Status::OK();
5660 TF_RETURN_IF_ERROR(desc->SetDynamicSize(params, **operand));
5661 bool need_reshape = (desc->f != 1 || desc->c != 1);
5662 bool need_transpose = !desc->permute.empty();
5663 if ((*operand)->is_weights()) {
5664 nvinfer1::Dims new_dims;
5665 TF_RETURN_IF_ERROR(GetEinsumNewStaticShape(*desc, &new_dims));
5666 if (!need_transpose) {
5667 TRT_ShapedWeights weights((*operand)->weights());
5668 TF_RETURN_IF_ERROR(weights.SetShape(new_dims));
5669 operand->reset(new TRT_TensorOrWeights(weights));
5670 return Status::OK();
5671 }
5672 // TODO(tfeher): Instead of creating a tensor that will be transposed,
    // transpose the weight itself. Keeping it as a weight could enable the FC layer.
5674 ITensorProxyPtr tensor = params->converter->CreateConstantLayer(
5675 (*operand)->weights(), (*operand)->GetTrtDims());
5676 operand->reset(new TRT_TensorOrWeights(tensor));
5677 }
5678
5679 if (!need_transpose && !need_reshape) return Status::OK();
5680 ITensorProxyPtr operand_tensor = (*operand)->tensor();
5681 TFTRT_RETURN_ERROR_IF_NULLPTR(operand_tensor, "Null tensor at Einsum");
5682 nvinfer1::IShuffleLayer* layer =
5683 params->converter->network()->addShuffle(*operand_tensor->trt_tensor());
5684
5685 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, params->node_def.name());
5686 params->converter->SetLayerName(layer, params->node_def, "shuffle",
5687 /*op_instance=*/op_instance);
5688 // Set new shape.
5689 if (need_reshape) {
5690 if (desc->HasStaticShape()) {
5691 nvinfer1::Dims new_dims;
5692 TF_RETURN_IF_ERROR(GetEinsumNewStaticShape(*desc, &new_dims));
5693 layer->setReshapeDimensions(new_dims);
5694 } else {
5695 ITensorProxyPtr new_shape;
5696 TF_RETURN_IF_ERROR(GetEinsumNewDynamicShape(params, *desc, &new_shape));
5697 layer->setInput(1, *new_shape->trt_tensor());
5698 }
5699 }
5700
5701 if (need_transpose) {
5702 layer->setFirstTranspose(desc->GetPermutation());
5703 }
5704 operand->reset(new TRT_TensorOrWeights(layer->getOutput(0)));
5705 return Status::OK();
5706 }
5707
5708 // Combines output dims/labels by copying batch and free dims/labels from input
5709 // A, and concatenating free values from input B.
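// For example, with desc_a in BFC layout (b=1, f=2) and dims [N, x1, x2, k],
// and desc_b in BCF layout (f=1) and dims [N, k, y], the assembled output is
// [N, x1, x2, y].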
5710 template <typename InputIterator, typename OutputIterator>
void AssembleOutput(InputIterator begin_a, InputIterator begin_b,
5712 const EinsumDescriptor& desc_a,
5713 const EinsumDescriptor& desc_b, OutputIterator out) {
5714 std::copy(begin_a, begin_a + desc_a.b, out);
5715 begin_a += desc_a.offset_f;
5716 std::copy(begin_a, begin_a + desc_a.f, out + desc_a.b);
5717 begin_b += desc_b.offset_f;
5718 std::copy(begin_b, begin_b + desc_b.f, out + desc_a.b + desc_a.f);
5719 }
5720
5721 // Restores free dimensions and sets final index order. Consider C = A * B,
5722 // batched MatMul op, where A.shape = [B, x, k] and B.shape = [B, k, y]. Then
5723 // C.shape = [B, x, y]. Here B can denote multiple batch indices while x, y, k
5724 // are single indices. The original inputs to Einsum can have multiple free
// indices. These were combined into single free dimensions x and y, for example
// x = f_a1 * f_a2 * f_a3, y = f_b1 * f_b2. This routine creates a shuffle layer
// to expand x and y into the original free dims, e.g. C is reshaped to
5728 // [B, f_a1, f_a2, f_a3, f_b1, f_b2]. Finally, a permutation is applied to
5729 // transform the shape to the shape of the original Einsum output.
Status ShuffleEinsumOutput(OpConverterParams* params, EinsumDescriptor desc_a,
5731 EinsumDescriptor desc_b,
5732 const std::vector<int>& permutation,
5733 ITensorProxyPtr* output) {
5734 if (permutation.empty() && (desc_a.f == 1 && desc_b.f == 1))
5735 return Status::OK();
5736
5737 nvinfer1::IShuffleLayer* layer =
5738 params->converter->network()->addShuffle(*(*output)->trt_tensor());
5739 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, params->node_def.name());
5740 params->converter->SetLayerName(layer, params->node_def, "shuffle",
5741 /*op_instance=*/2);
5742
5743 int output_rank = desc_a.b + desc_a.f + desc_b.f;
5744 if (desc_a.f != 1 || desc_b.f != 1) {
5745 if (desc_a.HasStaticShape() && desc_b.HasStaticShape()) {
5746 nvinfer1::Dims dims_out = {output_rank, {}};
5747 AssembleOutput(desc_a.dims.d, desc_b.dims.d, desc_a, desc_b, dims_out.d);
5748 layer->setReshapeDimensions(dims_out);
5749 } else {
5750 std::vector<ITensorProxyPtr> size_tensors(output_rank);
5751 AssembleOutput(desc_a.size_tensors.begin(), desc_b.size_tensors.begin(),
5752 desc_a, desc_b, size_tensors.begin());
5753 ITensorProxyPtr new_shape;
5754 TF_RETURN_IF_ERROR(ConcatenateShape(params, size_tensors, &new_shape));
5755 layer->setInput(1, *new_shape->trt_tensor());
5756 }
5757 }
5758
5759 if (!permutation.empty()) {
5760 nvinfer1::Permutation p;
5761 std::copy(permutation.begin(), permutation.end(), p.order);
5762 layer->setSecondTranspose(p);
5763 }
5764 *output = layer->getOutput(0);
5765 return Status::OK();
5766 }
5767
5768 // Prepares EinsumDescriptors after parsing the equation and determines the
5769 // final transpose.
Status ParseEquation(OpConverterParams* params,
5771 std::unique_ptr<TRT_TensorOrWeights>* input_a,
5772 std::unique_ptr<TRT_TensorOrWeights>* input_b,
5773 EinsumDescriptor* descriptor_a,
5774 EinsumDescriptor* descriptor_b,
5775 std::vector<int>* final_transpose) {
5776 TFAttrs attrs(params->node_def);
5777 std::string equation = attrs.get<string>("equation");
5778 VLOG(2) << "Einsum equation " << equation;
5779
5780 OperandLabels input_labels;
5781 Labels output_labels;
5782 std::vector<EinsumHelper::DimensionType> label_types;
5783 OperandLabelCounts input_label_counts;
5784 LabelCounts output_label_counts;
5785 absl::InlinedVector<bool, 2> input_has_ellipsis;
5786 bool output_has_ellipsis;
5787 TF_RETURN_IF_ERROR(EinsumHelper::ParseEquation(
5788 equation, &input_labels, &output_labels, &label_types,
5789 &input_label_counts, &output_label_counts, &input_has_ellipsis,
5790 &output_has_ellipsis));
5791
5792 VLOG(2) << "Output has ellipsis: " << output_has_ellipsis;
5793
5794 if (input_has_ellipsis[0] || input_has_ellipsis[1] || output_has_ellipsis) {
5795 // TODO(tfeher): Handle ellipsis like EinsumHelper::ProcessDimensions.
5796 // Note: ProcessDimensions would introduce kBroadcasting labels, which we
5797 // need to replace with kBatch before we call InitDescriptor.
5798 VLOG(2) << "Ellipsis not yet supported";
5799 return errors::Unimplemented("No conversion for einsum equation.");
5800 }
5801 if (absl::c_any_of(label_types, [](auto l) {
5802 return l == EinsumHelper::DimensionType::kReduce ||
5803 l == EinsumHelper::DimensionType::kBroadcasting;
5804 })) {
5805 VLOG(2) << "Einsum reductions not implemented";
5806 return errors::Unimplemented("No conversion for einsum equation.");
5807 }
5808
  // Returns true if any label appears more than once.
  auto has_duplicated_labels = [](const LabelCounts& label_counts) {
    return absl::c_any_of(label_counts, [](int i) { return i > 1; });
  };
  if (has_duplicated_labels(input_label_counts[0]) ||
      has_duplicated_labels(input_label_counts[1]) ||
      has_duplicated_labels(output_label_counts)) {
5815 VLOG(2) << "Einsum invalid label count";
5816 return errors::Unimplemented("No conversion for einsum equation.");
5817 }
5818
5819 if ((*input_a)->is_weights() && (*input_b)->is_tensor()) {
5820 // We prefer to use FC layer, needs A as tensor and B as weight.
5821 std::swap(*input_a, *input_b);
5822 std::swap(input_labels[0], input_labels[1]);
5823 std::swap(input_label_counts[0], input_label_counts[1]);
5824 }
5825
5826 TF_RETURN_IF_ERROR(descriptor_a->InitDescriptor(
5827 **input_a, input_labels[0], label_types, EinsumLayout::BFC));
5828 TF_RETURN_IF_ERROR(
5829 descriptor_b->InitDescriptor(**input_b, input_labels[1], label_types,
5830 EinsumLayout::BCF, descriptor_a));
5831 // TODO(tfeher): Update the permutation in the descriptors to avoid final
5832 // transpose (if possible). Consider swapping the input if it eliminates
5833 // final transpose.
5834
5835 // Get final transpose.
5836 Labels matmul_output_labels(descriptor_a->b + descriptor_a->f +
5837 descriptor_b->f);
5838 AssembleOutput(descriptor_a->permuted_labels.begin(),
5839 descriptor_b->permuted_labels.begin(), *descriptor_a,
5840 *descriptor_b, matmul_output_labels.begin());
5841 TF_RETURN_IF_ERROR(FindIndices(output_labels.begin(), output_labels.end(),
5842 matmul_output_labels.begin(),
5843 matmul_output_labels.end(), final_transpose));
5844 // Clear identity transpose.
5845 bool identity_transpose = true;
5846 for (int i = 0; i < final_transpose->size() && identity_transpose; i++) {
5847 identity_transpose &= final_transpose->at(i) == i;
5848 }
5849 if (identity_transpose) {
5850 final_transpose->clear();
5851 }
5852 return Status::OK();
5853 }
5854
Status ConvertEinsum(OpConverterParams* params) {
5856 const auto& inputs = params->inputs;
5857 const auto& node_def = params->node_def;
5858 if (params->use_implicit_batch) {
5859 return errors::Unimplemented(
5860 "Einsum converter requires dynamic shape mode");
5861 }
5862
5863 if (inputs.size() != 2) {
5864 VLOG(2) << "Einsum converter supports two operands at " << node_def.name()
5865 << " got " << inputs.size();
5866 return errors::Unimplemented("No conversion for einsum equation.");
5867 }
5868 TF_RETURN_IF_ERROR(
5869 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
5870
5871 auto input_a = std::make_unique<TRT_TensorOrWeights>(inputs.at(0));
5872 auto input_b = std::make_unique<TRT_TensorOrWeights>(inputs.at(1));
5873 EinsumDescriptor descriptor_a;
5874 EinsumDescriptor descriptor_b;
5875 std::vector<int> final_transpose;
5876 TF_RETURN_IF_ERROR(ParseEquation(params, &input_a, &input_b, &descriptor_a,
5877 &descriptor_b, &final_transpose));
5878
5879 TF_RETURN_IF_ERROR(ShuffleEinsumTensor(params, &input_a, &descriptor_a,
5880 /*op_instance=*/0));
5881 TF_RETURN_IF_ERROR(ShuffleEinsumTensor(params, &input_b, &descriptor_b,
5882 /*op_instance=*/1));
5883 if (params->validation_only) return Status::OK();
5884
5885 StatusOr<ITensorProxyPtr> result = ConvertMatMulImpl(
5886 params, *input_a, *input_b, descriptor_a.layout == EinsumLayout::BCF,
5887 descriptor_b.layout == EinsumLayout::BFC);
5888 TF_RETURN_IF_ERROR(result.status());
5889 ITensorProxyPtr output = result.ValueOrDie();
5890
5891 TF_RETURN_IF_ERROR(ShuffleEinsumOutput(params, descriptor_a, descriptor_b,
5892 final_transpose, &output));
5893 params->outputs->push_back(TRT_TensorOrWeights(output));
5894 return Status::OK();
5895 }
5896 #endif // IS_TRT_VERSION_GE(7, 1, 3, 0)
5897
Status ConvertSoftmax(OpConverterParams* params) {
5899 const auto& inputs = params->inputs;
5900 const auto& node_def = params->node_def;
5901 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"logits", false}}));
5902 TF_RETURN_IF_ERROR(
5903 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
5904 ITensorProxyPtr tensor = inputs.at(0).tensor();
5905
5906 const int num_trt_dims = tensor->getDimensions().nbDims;
5907 if (num_trt_dims == 0 && params->use_implicit_batch) {
5908 return errors::InvalidArgument(
5909 "TensorRT Softmax cannot apply on batch dimension, at",
5910 node_def.name());
5911 }
5912 if (params->validation_only) return Status::OK();
5913
5914 nvinfer1::ISoftMaxLayer* layer =
5915 params->converter->network()->addSoftMax(*tensor->trt_tensor());
5916 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
5917 params->converter->SetLayerName(layer, node_def);
  // TensorFlow's SoftMax applies softmax along the last dimension.
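  // For example, a tensor with 3 TRT dims uses the axes bitmask
  // 1 << 2 = 0b100, which selects the last dimension.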
5919 layer->setAxes(1 << (num_trt_dims - 1));
5920
5921 ITensorProxyPtr output_tensor = layer->getOutput(0);
5922 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
5923 return Status::OK();
5924 }
5925
Status ConvertArgMinMax(OpConverterParams* params) {
5927 const auto& inputs = params->inputs;
5928 const auto& node_def = params->node_def;
5929 TF_RETURN_IF_ERROR(
5930 CheckInputsWeights(*params, {{"input", false}, {"dimension", true}}));
5931 TF_RETURN_IF_ERROR(
5932 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
5933 // INT64 outputs are not supported by TRT.
5934 TFAttrs attrs(node_def);
5935 DataType output_dtype = attrs.get<DataType>("output_type");
5936 if (output_dtype != DataType::DT_INT32) {
5937 return errors::Unimplemented("Output type ", DataTypeString(output_dtype),
5938 " is not supported, at ", node_def.name());
5939 }
5940 int tf_axis = inputs.at(1).weights().GetSpan<int>()[0];
5941 int trt_axis;
5942 nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
5943 TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(),
5944 params->use_implicit_batch, &trt_axis));
5945 nvinfer1::TopKOperation topk_op;
5946 if (node_def.op() == "ArgMin") {
5947 topk_op = nvinfer1::TopKOperation::kMIN;
5948 } else if (node_def.op() == "ArgMax") {
5949 topk_op = nvinfer1::TopKOperation::kMAX;
5950 } else {
5951 return errors::InvalidArgument("Unsupported ArgMin/Max operation");
5952 }
5953
5954 #if !IS_TRT_VERSION_GE(7, 0, 0, 11)
5955 const nvinfer1::Dims trt_dims = params->inputs.at(0).GetTrtDims();
5956 if (trt_dims.nbDims >= 4) {
5957 string trt_dim_str = DebugString(trt_dims);
5958
    return errors::Unimplemented(node_def.op(), " op is not able to support",
5960 " tensors with 4+ dimensions (excluding batch",
5961 " size). Received: ", trt_dim_str);
5962 }
5963 #endif
5964
5965 if (params->validation_only) return Status::OK();
5966
5967 // Use TopK with k = 1. Only indices output is needed (output 1).
5968 const uint32_t reduce_axes = 1 << trt_axis;
5969 nvinfer1::ITopKLayer* layer = params->converter->network()->addTopK(
5970 *inputs.at(0).tensor()->trt_tensor(), topk_op, 1, reduce_axes);
5971 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
5972 params->converter->SetLayerName(layer, node_def, "topk");
5973 ITensorProxyPtr output_indices_tensor = layer->getOutput(1);
5974
5975 // Squeeze on axis.
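  // For example, with input dims [3, 4, 5] and trt_axis=1, the dims passed to
  // SqueezeTensor become [3, 0, 5] and the squeezed output has shape [3, 5].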
5976 std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
5977 input_dims[trt_axis] = 0;
5978 ITensorProxyPtr output_tensor = nullptr;
5979 TF_RETURN_IF_ERROR(params->converter->SqueezeTensor(
5980 output_indices_tensor, &input_dims, params, &output_tensor));
5981 params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
5982
5983 return Status::OK();
5984 }
5985
Status ConvertTopK(OpConverterParams* params) {
5987 const auto& inputs = params->inputs;
5988 const auto& node_def = params->node_def;
5989 TF_RETURN_IF_ERROR(
5990 CheckInputsWeights(*params, {{"input", false}, {"k", true}}));
5991 TF_RETURN_IF_ERROR(
5992 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
5993 TFAttrs attrs(node_def);
5994 const bool sorted = attrs.get<bool>("sorted");
5995 if (!sorted) {
    // TensorRT only supports sorted output. Although the TensorFlow API
    // doesn't specify the order of output elements when sorted=false, it is
    // safer not to convert, because the TensorRT output might differ from
    // TensorFlow's and cause confusion.
6000 return errors::InvalidArgument("Only sorted=True is supported, at",
6001 node_def.name());
6002 }
6003
6004 ITensorProxyPtr tensor = inputs.at(0).tensor();
6005 const int num_dims = tensor->getDimensions().nbDims;
6006 if (num_dims == 0) {
6007 return errors::InvalidArgument(
6008 "TensorRT TopK cannot apply on batch dimension, at", node_def.name());
6009 }
6010
6011 TRT_ShapedWeights k_w = inputs.at(1).weights();
6012 if (k_w.count() != 1) {
6013 return errors::InvalidArgument("k value of TopK should be a scalar, at",
6014 node_def.name());
6015 }
  // Note that ITopKLayer always has sorted outputs, so we don't need to handle
6017 // the 'sorted' attribute of the node.
6018 if (params->validation_only) return Status::OK();
6019
6020 const nvinfer1::TopKOperation op = nvinfer1::TopKOperation::kMAX;
6021 const int k = *(static_cast<int*>(k_w.GetValues()));
6022 const uint32_t reduce_axes = 1 << (num_dims - 1);
6023 nvinfer1::ITopKLayer* layer = params->converter->network()->addTopK(
6024 *tensor->trt_tensor(), op, k, reduce_axes);
6025 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
6026 params->converter->SetLayerName(layer, node_def);
6027
6028 ITensorProxyPtr output_value_tensor = layer->getOutput(0);
6029 ITensorProxyPtr output_indices_tensor = layer->getOutput(1);
6030 params->outputs->push_back(TRT_TensorOrWeights(output_value_tensor));
6031 params->outputs->push_back(TRT_TensorOrWeights(output_indices_tensor));
6032 return Status::OK();
6033 }
6034
6035 StatusOr<std::pair<ITensorProxyPtr, ITensorProxyPtr>>
CalcDepthSpaceDynamicShape(OpConverterParams* params, int block_size,
6037 string data_format) {
  // The input has a dynamic shape, so we use a shape layer and shape
  // arithmetic to calculate the reshape dimensions.
6040 const auto& inputs = params->inputs;
6041 const auto& node_def = params->node_def;
6042
6043 const int channels_axis = data_format == "NCHW" ? 1 : 3;
6044 const int h_axis = data_format == "NCHW" ? 2 : 1;
6045 const int w_axis = data_format == "NCHW" ? 3 : 2;
6046
6047 // Get shapes.
6048 ITensorProxyPtr shape = params->converter->network()
6049 ->addShape(*inputs.at(0).tensor()->trt_tensor())
6050 ->getOutput(0);
6051 ITensorProxyPtr batch_size =
6052 params->converter->network()
6053 ->addSlice(*shape->trt_tensor(), {1, {0}}, {1, {1}}, {1, {1}})
6054 ->getOutput(0);
6055 ITensorProxyPtr num_channels =
6056 params->converter->network()
6057 ->addSlice(*shape->trt_tensor(), {1, {channels_axis}}, {1, {1}},
6058 {1, {1}})
6059 ->getOutput(0);
6060 ITensorProxyPtr h =
6061 params->converter->network()
6062 ->addSlice(*shape->trt_tensor(), {1, {h_axis}}, {1, {1}}, {1, {1}})
6063 ->getOutput(0);
6064 ITensorProxyPtr w =
6065 params->converter->network()
6066 ->addSlice(*shape->trt_tensor(), {1, {w_axis}}, {1, {1}}, {1, {1}})
6067 ->getOutput(0);
6068 ITensorProxyPtr r;
6069 TF_RETURN_IF_ERROR(CreateScalarConstant(params, block_size, &r));
6070 ITensorProxyPtr r_squared;
6071 TF_RETURN_IF_ERROR(
6072 CreateScalarConstant(params, block_size * block_size, &r_squared));
6073 // Get shuffle parameters.
6074 std::vector<ITensorProxyPtr> first_shuffle_tensors(6, nullptr);
6075 std::vector<ITensorProxyPtr> second_shuffle_tensors(4, nullptr);
6076 if (node_def.op() == "DepthToSpace") {
    // First Reshape [N, C, H, W] -> [N, r, r, C/(r*r), H, W].
6078 first_shuffle_tensors[0] = batch_size;
6079 first_shuffle_tensors[1] = r;
6080 first_shuffle_tensors[2] = r;
6081 first_shuffle_tensors[3] =
6082 params->converter->network()
6083 ->addElementWise(*num_channels->trt_tensor(),
6084 *r_squared->trt_tensor(),
6085 nvinfer1::ElementWiseOperation::kDIV)
6086 ->getOutput(0);
6087 first_shuffle_tensors[4] = h;
6088 first_shuffle_tensors[5] = w;
6089 // Second Reshape [N, C/(r*r), H, r, W, r] -> [N, C/(r*r), H * r, W * r].
6090 second_shuffle_tensors[0] = batch_size;
6091 second_shuffle_tensors[1] =
6092 params->converter->network()
6093 ->addElementWise(*num_channels->trt_tensor(),
6094 *r_squared->trt_tensor(),
6095 nvinfer1::ElementWiseOperation::kDIV)
6096 ->getOutput(0);
6097 second_shuffle_tensors[2] =
6098 params->converter->network()
6099 ->addElementWise(*h->trt_tensor(), *r->trt_tensor(),
6100 nvinfer1::ElementWiseOperation::kPROD)
6101 ->getOutput(0);
6102 second_shuffle_tensors[3] =
6103 params->converter->network()
6104 ->addElementWise(*w->trt_tensor(), *r->trt_tensor(),
6105 nvinfer1::ElementWiseOperation::kPROD)
6106 ->getOutput(0);
6107 } else if (node_def.op() == "SpaceToDepth") {
6108 // First Reshape [N, C, H, W] -> [N, C, H/r, r, W/r, r].
6109 first_shuffle_tensors[0] = batch_size;
6110 first_shuffle_tensors[1] = num_channels;
6111 first_shuffle_tensors[2] =
6112 params->converter->network()
6113 ->addElementWise(*h->trt_tensor(), *r->trt_tensor(),
6114 nvinfer1::ElementWiseOperation::kDIV)
6115 ->getOutput(0);
6116 first_shuffle_tensors[3] = r;
6117 first_shuffle_tensors[4] =
6118 params->converter->network()
6119 ->addElementWise(*w->trt_tensor(), *r->trt_tensor(),
6120 nvinfer1::ElementWiseOperation::kDIV)
6121 ->getOutput(0);
6122 first_shuffle_tensors[5] = r;
6123
6124 // Second Reshape [N, r, r, C, H/r, W/r] -> [N, C*r*r, H/r, W/r].
6125 second_shuffle_tensors[0] = batch_size;
6126 second_shuffle_tensors[1] =
6127 params->converter->network()
6128 ->addElementWise(*num_channels->trt_tensor(),
6129 *r_squared->trt_tensor(),
6130 nvinfer1::ElementWiseOperation::kPROD)
6131 ->getOutput(0);
6132 second_shuffle_tensors[2] =
6133 params->converter->network()
6134 ->addElementWise(*h->trt_tensor(), *r->trt_tensor(),
6135 nvinfer1::ElementWiseOperation::kDIV)
6136 ->getOutput(0);
6137 second_shuffle_tensors[3] =
6138 params->converter->network()
6139 ->addElementWise(*w->trt_tensor(), *r->trt_tensor(),
6140 nvinfer1::ElementWiseOperation::kDIV)
6141 ->getOutput(0);
6142 }
6143
6144 StatusOr<ITensorProxyPtr> result =
6145 ConcatenateTensors(params, first_shuffle_tensors, 0);
6146 TF_RETURN_IF_ERROR(result.status());
6147 ITensorProxyPtr first_shuffle_shape = result.ValueOrDie();
6148
6149 result = ConcatenateTensors(params, second_shuffle_tensors, 1);
6150 TF_RETURN_IF_ERROR(result.status());
6151 ITensorProxyPtr second_shuffle_shape = result.ValueOrDie();
6152
6153 return std::make_pair(first_shuffle_shape, second_shuffle_shape);
6154 }
6155
Status ConvertDepthSpaceShuffle(OpConverterParams* params) {
6157 const auto& inputs = params->inputs;
6158 const auto& node_def = params->node_def;
6159 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
6160 TF_RETURN_IF_ERROR(AllowDataTypes(
6161 *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
6162 TFAttrs attrs(node_def);
6163 const int block_size = attrs.get<int64>("block_size");
6164 if (block_size < 2) {
6165 return errors::InvalidArgument("Block size must be 2 or greater, at ",
6166 node_def.name());
6167 }
6168 const string data_format = attrs.get<string>("data_format");
6169 if (data_format != "NCHW" && data_format != "NHWC") {
6170 return errors::Unimplemented("Data format ", data_format,
6171 " is not supported, at ", node_def.name());
6172 }
6173 int idx_offset = params->use_implicit_batch ? 0 : 1;
6174 nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
6175 const int required_rank = 3 + idx_offset;
6176 if (dims.nbDims != required_rank) {
6177 return errors::InvalidArgument("The input to ", node_def.op(),
6178 " must be rank 4, at ", node_def.name());
6179 }
6180 const int num_channels =
6181 data_format == "NCHW" ? dims.d[0 + idx_offset] : dims.d[2 + idx_offset];
6182 const int h =
6183 data_format == "NCHW" ? dims.d[1 + idx_offset] : dims.d[0 + idx_offset];
6184 const int w =
6185 data_format == "NCHW" ? dims.d[2 + idx_offset] : dims.d[1 + idx_offset];
6186 // Get shuffle parameters.
6187 nvinfer1::Dims first_shuffle_shape;
6188 nvinfer1::Permutation transpose_perm;
6189 nvinfer1::Dims second_shuffle_shape;
6190
6191 // We define all the shuffle and transpose dimensions assuming implicit batch
6192 // mode. Afterwards we will update them to explicit batch mode if needed.
6193 // Additionally, an NCHW layout is assumed, and this assumption is corrected
6194 // afterwards with an initial transpose op. TODO(tfeher): Get rid of the
6195 // layout_transpose ops by defining shuffle shape specifically for NCHW and
  // NHWC.
6197 if (node_def.op() == "DepthToSpace") {
6198 if (num_channels != -1 && num_channels % (block_size * block_size) != 0) {
6199 return errors::InvalidArgument(
6200 "Number of channels must be divisible by block_size*block_size, at ",
6201 node_def.name());
6202 }
    // First Reshape [C, H, W] -> [r, r, C/(r*r), H, W]
6204 first_shuffle_shape = {
6205 /*nbDims=*/5,
6206 /*d=*/{block_size, block_size, num_channels / (block_size * block_size),
6207 h, w}};
6208 // Transpose [r, r, C/(r*r), H, W] -> [C/(r*r), H, r, W, r]
6209 transpose_perm = {2, 3, 0, 4, 1};
6210 // Second Reshape [C/(r*r), H, r, W, r] -> [C/(r*r), H * r, W * r]
6211 second_shuffle_shape =
6212 nvinfer1::Dims3(num_channels / (block_size * block_size),
6213 h * block_size, w * block_size);
6214 } else {
6215 if (node_def.op() != "SpaceToDepth")
6216 return errors::InvalidArgument("Incorrect op type ", node_def.op());
6217 if ((h != -1 && h % block_size != 0) || (w != -1 && w % block_size != 0)) {
6218 return errors::InvalidArgument(
6219 "Width and height must be divisible by block_size, at ",
6220 node_def.name());
6221 }
6222 // First Reshape [C, H, W] -> [C, H/r, r, W/r, r]
6223 first_shuffle_shape = {/*nbDims=*/5,
6224 /*d=*/{num_channels, h / block_size, block_size,
6225 w / block_size, block_size}};
6226 // Transpose [C, H/r, r, W/r, r] -> [r, r, C, H/r, W/r]
6227 transpose_perm = {2, 4, 0, 1, 3};
6228 // Second Reshape [r, r, C, H/r, W/r] -> [C*r*r, H/r, W/r]
6229 second_shuffle_shape = nvinfer1::Dims3(
6230 num_channels * block_size * block_size, h / block_size, w / block_size);
6231 }
6232 if (params->validation_only) return Status::OK();
6233
6234 nvinfer1::IShuffleLayer* first_shuffle =
6235 params->converter->network()->addShuffle(
6236 *inputs.at(0).tensor()->trt_tensor());
6237 TFTRT_RETURN_ERROR_IF_NULLPTR(first_shuffle, node_def.name());
6238 params->converter->SetLayerName(first_shuffle, node_def, "shuffle",
6239 /*op_instance=*/0);
6240
6241 ITensorProxyPtr second_shuffle_shape_tensor;
6242
6243 if (HasStaticShape(inputs.at(0).GetTrtDims())) {
6244 // Adjust a reshape constructed at implicit batch mode for explicit batch
6245 // mode. In particular, we need to insert the batch dimension size to the
6246 // beginning of all the dimension sizes. Example: reshape {20,10,30} for
6247 // implicit batch mode becomes reshape {N,20,10,30} for explicit batch mode.
6248 auto adjust_reshape = [](int N, nvinfer1::Dims dims,
6249 bool use_implicit_batch) {
6250 if (use_implicit_batch) return dims;
6251 for (int i = dims.nbDims; i > 0; i--) {
6252 dims.d[i] = dims.d[i - 1];
6253 }
6254 dims.d[0] = N;
6255 dims.nbDims++;
6256 return dims;
6257 };
6258
6259 first_shuffle_shape = adjust_reshape(dims.d[0], first_shuffle_shape,
6260 params->use_implicit_batch);
6261 second_shuffle_shape = adjust_reshape(dims.d[0], second_shuffle_shape,
6262 params->use_implicit_batch);
6263
6264 first_shuffle->setReshapeDimensions(first_shuffle_shape);
6265 } else {
6266 StatusOr<std::pair<ITensorProxyPtr, ITensorProxyPtr>> result =
6267 CalcDepthSpaceDynamicShape(params, block_size, data_format);
6268 TF_RETURN_IF_ERROR(result.status());
6269 first_shuffle->setInput(1, *result.ValueOrDie().first->trt_tensor());
6270 second_shuffle_shape_tensor = result.ValueOrDie().second;
6271 }
6272
6273 // Adjust a transpose constructed assuming implicit batch mode for explicit
6274 // batch mode. In particular, we need to add the batch dimension to d0 and
  // add 1 to all the dimension ids in the transpose. Example: permutation
  // {2,1,0} for implicit batch mode becomes permutation {0,3,2,1} for explicit
  // batch mode.
6278 auto adjust_perm = [](int n, nvinfer1::Permutation perm,
6279 bool use_implicit_batch) {
6280 if (use_implicit_batch) return perm;
6281 for (int i = n; i > 0; i--) {
6282 perm.order[i] = perm.order[i - 1] + 1;
6283 }
6284 perm.order[0] = 0;
6285 return perm;
6286 };
6287 transpose_perm = adjust_perm(5, transpose_perm, params->use_implicit_batch);
6288
6289 if (data_format == "NHWC") {
6290 nvinfer1::Permutation layout_transpose =
6291 adjust_perm(3, {2, 0, 1}, params->use_implicit_batch);
6292 first_shuffle->setFirstTranspose(layout_transpose);
6293 }
6294 first_shuffle->setSecondTranspose(transpose_perm);
6295
6296 nvinfer1::IShuffleLayer* second_shuffle =
6297 params->converter->network()->addShuffle(*first_shuffle->getOutput(0));
6298 TFTRT_RETURN_ERROR_IF_NULLPTR(second_shuffle, node_def.name());
6299 params->converter->SetLayerName(second_shuffle, node_def, "shuffle",
6300 /*op_instance=*/1);
6301
6302 if (HasStaticShape(inputs.at(0).GetTrtDims())) {
6303 second_shuffle->setReshapeDimensions(second_shuffle_shape);
6304 } else {
6305 second_shuffle->setInput(1, *second_shuffle_shape_tensor->trt_tensor());
6306 }
6307 if (data_format == "NHWC") {
6308 nvinfer1::Permutation layout_transpose =
6309 adjust_perm(3, {1, 2, 0}, params->use_implicit_batch);
6310 second_shuffle->setSecondTranspose(layout_transpose);
6311 }
6312
6313 params->outputs->push_back(TRT_TensorOrWeights(second_shuffle->getOutput(0)));
6314 return Status::OK();
6315 }
6316
Status ConvertSquaredDifference(OpConverterParams* params) {
6318 TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}, {"y", false}}));
6319 TF_RETURN_IF_ERROR(
6320 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
6321 const auto& inputs = params->inputs;
6322 const auto& node_def = params->node_def;
6323 // Broadcast inputs.
6324 nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r;
6325 TF_RETURN_IF_ERROR(GetTrtBroadcastShape(
6326 inputs.at(0), inputs.at(1), /*check_feasibility=*/true,
6327 params->use_implicit_batch, &broadcasted_dims_l, &broadcasted_dims_r));
6328 ITensorProxyPtr tensor_l = nullptr;
6329 ITensorProxyPtr tensor_r = nullptr;
6330 TF_RETURN_IF_ERROR(
6331 PrepareTensorForShape(params->converter, inputs.at(0), broadcasted_dims_l,
6332 params->validation_only, &tensor_l, node_def));
6333 TF_RETURN_IF_ERROR(
6334 PrepareTensorForShape(params->converter, inputs.at(1), broadcasted_dims_r,
6335 params->validation_only, &tensor_r, node_def));
6336 if (params->validation_only) return Status::OK();
6337
6338 // Subtract x - y.
6339 nvinfer1::IElementWiseLayer* sub =
6340 params->converter->network()->addElementWise(
6341 *tensor_l->trt_tensor(), *tensor_r->trt_tensor(),
6342 nvinfer1::ElementWiseOperation::kSUB);
6343 TFTRT_RETURN_ERROR_IF_NULLPTR(sub, node_def.name());
6344 params->converter->SetLayerName(sub, node_def, "sub");
6345
6346 // Multiply (x - y) * (x - y).
6347 nvinfer1::IElementWiseLayer* mul =
6348 params->converter->network()->addElementWise(
6349 *sub->getOutput(0), *sub->getOutput(0),
6350 nvinfer1::ElementWiseOperation::kPROD);
6351 TFTRT_RETURN_ERROR_IF_NULLPTR(mul, node_def.name());
6352 params->converter->SetLayerName(mul, node_def, "mul");
6353
6354 params->outputs->push_back(TRT_TensorOrWeights(mul->getOutput(0)));
6355 return Status::OK();
6356 }
6357
6358 #if IS_TRT_VERSION_GE(7, 1, 3, 0)
6359
bool AllowNmsTopkOverride() {
6361 static bool result = [] {
6362 bool value;
6363 Status status = ReadBoolFromEnvVar("TF_TRT_ALLOW_NMS_TOPK_OVERRIDE",
6364 /*default_value=*/false, &value);
6365 if (!status.ok()) {
6366 LOG(ERROR) << status;
6367 }
6368 return value;
6369 }();
6370 return result;
6371 }
6372
Status ConvertCombinedNMS(OpConverterParams* params) {
6374 TF_RETURN_IF_ERROR(
6375 CheckInputsWeights(*params, {{"boxes", false},
6376 {"scores", false},
6377 {"max_output_size_per_class", true},
6378 {"max_total_size", true},
6379 {"iou_threshold", true},
6380 {"score_threshold", true}}));
6381 const auto& inputs = params->inputs;
6382 const auto& node_def = params->node_def;
6383
6384 ITensorProxyPtr boxes_tensor = inputs.at(0).tensor();
6385 ITensorProxyPtr scores_tensor = inputs.at(1).tensor();
6386 TRT_ShapedWeights output_size_per_class = inputs.at(2).weights();
6387 TRT_ShapedWeights total_size = inputs.at(3).weights();
6388 TRT_ShapedWeights iou_threshold = inputs.at(4).weights();
6389 TRT_ShapedWeights score_threshold = inputs.at(5).weights();
6390
6391 // Validate tensors and weights (also set some of the needed plugin fields)
6392 const auto boxes_dims = boxes_tensor->getDimensions();
6393 const auto scores_dims = scores_tensor->getDimensions();
6394 if (!params->use_implicit_batch &&
6395 (!HasStaticShape(boxes_dims) || !HasStaticShape(scores_dims))) {
6396 return errors::Unimplemented(
6397 "TensorRT BatchedNMS Plugin requires input with static shape");
6398 }
6399 const int offset = params->use_implicit_batch ? 0 : 1;
6400 if (boxes_dims.nbDims != 3 + offset) {
6401 return errors::InvalidArgument(
6402 "TensorRT BatchedNMS Plugin input boxes must be 4-D including batch ",
6403 node_def.name());
6404 }
6405 const int class_idx = 1 + offset;
6406 const int num_classes = scores_dims.d[class_idx];
6407 const int num_boxes = boxes_dims.d[0 + offset];
6408 bool box_check =
6409 boxes_dims.d[class_idx] == 1 || boxes_dims.d[class_idx] == num_classes;
6410 if (!box_check) {
6411 return errors::InvalidArgument(
6412 "TensorRT BatchedNMS Plugin third dimension of boxes must be either 1 "
6413 "or num_classes ",
6414 node_def.name());
6415 }
6416
6417 if (output_size_per_class.count() != 1) {
6418 return errors::InvalidArgument(
6419 "TensorRT BatchedNMS Plugin max_output_size_per_class must be scalar ",
6420 node_def.name());
6421 }
6422 int max_size_per_class =
6423 *(static_cast<int*>(output_size_per_class.GetValues()));
6424 if (max_size_per_class <= 0) {
6425 return errors::InvalidArgument(
6426 "TensorRT BatchedNMS Plugin max_output_size_per_class should be > 0",
6427 node_def.name());
6428 }
6429 if (total_size.count() != 1) {
6430 return errors::InvalidArgument(
6431 "TensorRT BatchedNMS Plugin max_total_size must be scalar ",
6432 node_def.name());
6433 }
6434 int max_total_size = *(static_cast<int*>(total_size.GetValues()));
6435 if (max_total_size <= 0) {
6436 return errors::InvalidArgument(
6437 "TensorRT BatchedNMS Plugin max_total_size should be > 0",
6438 node_def.name());
6439 }
6440 if (iou_threshold.count() != 1) {
6441 return errors::InvalidArgument(
6442 "TensorRT BatchedNMS Plugin iou_threshold must be scalar ",
6443 node_def.name());
6444 }
6445 float iou_thresh = *(static_cast<float*>(iou_threshold.GetValues()));
6446 if (iou_thresh < 0.0 || iou_thresh > 1.0) {
6447 return errors::InvalidArgument(
6448 "TensorRT BatchedNMS Plugin iou_threshold must be in [0, 1]",
6449 node_def.name());
6450 }
6451 if (score_threshold.count() != 1) {
6452 return errors::InvalidArgument(
6453 "TensorRT BatchedNMS Plugin score_threshold must be scalar ",
6454 node_def.name());
6455 }
6456
  // TRT op is_normalized=False treats input coordinates as pixels and
6458 // calculates width/height as (max - min + 1).
6459 //
6460 // TF op CombinedNonMaxSuppression doesn't care about the normalization and
6461 // calculates width/height as (max-min).
6462 //
  // We set is_normalized = true to be consistent with the TF IoU calculation.
6464 const bool is_normalized = true;
6465
6466 TFAttrs attrs(node_def);
6467 bool share_location = (boxes_dims.d[class_idx] == 1);
6468 const bool pad_per_class = attrs.get<bool>("pad_per_class");
6469 const bool clip_boxes = attrs.get<bool>("clip_boxes");
6470 int keep_top_k = 0;
6471 if (pad_per_class) {
6472 keep_top_k = std::min(max_size_per_class * num_classes, max_total_size);
6473 } else {
6474 keep_top_k = max_total_size;
6475 }
6476
6477 // According to the batchedNMS plugin description we need to set top_k so that
6478 // keep_top_k <= top_k
6479 // https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
  // Before the NMS step, TRT selects the top_k candidates from each class and
  // discards the rest. The NMS step is performed only among the top_k
  // candidates. To be strictly compatible with the TF op, top_k must be greater
  // than or equal to num_boxes.
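  // For example, with num_boxes=1000, max_total_size=100 and
  // pad_per_class=false, keep_top_k=100 and top_k=1000.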
6484 int top_k = std::max(num_boxes, keep_top_k);
6485 // TRT has a limitation: top_k <=4096.
6486 if (top_k > 4096) {
6487 if (AllowNmsTopkOverride()) {
6488 top_k = 4096;
6489 keep_top_k = std::min(top_k, keep_top_k);
6490 } else {
6491 return errors::InvalidArgument(
          "TRT NMS plugin allows top_k<=4096, where top_k = max(num_boxes, "
6493 "max_total_size). You can override this by setting "
6494 "TF_TRT_ALLOW_NMS_TOPK_OVERRIDE=1 environment variable, but this can "
6495 "result in a loss of accuracy.");
6496 }
6497 }
6498
6499 if (params->validation_only) return Status::OK();
6500 float score_thresh = *(static_cast<float*>(score_threshold.GetValues()));
6501 const int background_id = -1;
6502 nvinfer1::PluginField fields[9] = {
6503 nvinfer1::PluginField{"shareLocation", &share_location,
6504 nvinfer1::PluginFieldType::kINT32, 1},
6505 nvinfer1::PluginField{"backgroundLabelId", &background_id,
6506 nvinfer1::PluginFieldType::kINT32, 1},
6507 nvinfer1::PluginField{"numClasses", &num_classes,
6508 nvinfer1::PluginFieldType::kINT32, 1},
6509 nvinfer1::PluginField{"topK", &top_k, nvinfer1::PluginFieldType::kINT32,
6510 1},
6511 nvinfer1::PluginField{"keepTopK", &keep_top_k,
6512 nvinfer1::PluginFieldType::kINT32, 1},
6513 nvinfer1::PluginField{"scoreThreshold", &score_thresh,
6514 nvinfer1::PluginFieldType::kFLOAT32, 1},
6515 nvinfer1::PluginField{"iouThreshold", &iou_thresh,
6516 nvinfer1::PluginFieldType::kFLOAT32, 1},
6517 nvinfer1::PluginField{"isNormalized", &is_normalized,
6518 nvinfer1::PluginFieldType::kINT32, 1},
6519 nvinfer1::PluginField{"clipBoxes", &clip_boxes,
6520 nvinfer1::PluginFieldType::kINT32, 1}};
6521 nvinfer1::PluginFieldCollection fc{9, fields};
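  // Field sources: shareLocation is derived from the class dimension of the
  // boxes input, topK/keepTopK are computed above, scoreThreshold/iouThreshold
  // come from the weight inputs, isNormalized is fixed to true, clipBoxes
  // mirrors the TF clip_boxes attribute, and backgroundLabelId is set to -1
  // (no background class).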
6522
6523 // Get plugin creator
6524 auto creator =
6525 getPluginRegistry()->getPluginCreator("BatchedNMS_TRT", "1", "");
6526 TFTRT_RETURN_ERROR_IF_NULLPTR(creator, node_def.name());
6527
6528 // Create plugin
6529 TrtUniquePtrType<nvinfer1::IPluginV2> plugin(
6530 creator->createPlugin(node_def.name().c_str(), &fc));
6531 TFTRT_RETURN_ERROR_IF_NULLPTR(plugin, node_def.name());
6532
6533 // Set plugin inputs
6534 std::vector<nvinfer1::ITensor*> trt_plugin_inputs;
6535 trt_plugin_inputs.push_back(boxes_tensor->trt_tensor());
6536 trt_plugin_inputs.push_back(scores_tensor->trt_tensor());
6537
6538 // Add plugin to network
6539 nvinfer1::IPluginV2Layer* layer = params->converter->network()->addPluginV2(
6540 &trt_plugin_inputs[0], static_cast<int>(trt_plugin_inputs.size()),
6541 *plugin);
6542 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
6543 params->converter->SetLayerName(layer, node_def, "plugin");
6544
6545 // Set plugin outputs
6546 ITensorProxyPtr output_nmsed_boxes = layer->getOutput(1);
6547
6548 // TensorRT fixes (removes) the extra last dimension in CombinedNMS outputs
6549 ITensorProxyPtr output_num_detections = layer->getOutput(0);
6550 ITensorProxyPtr output_nmsed_scores = layer->getOutput(2);
6551 ITensorProxyPtr output_nmsed_classes = layer->getOutput(3);
6552
6553 params->outputs->push_back(TRT_TensorOrWeights(output_nmsed_boxes));
6554 params->outputs->push_back(TRT_TensorOrWeights(output_nmsed_scores));
6555 params->outputs->push_back(TRT_TensorOrWeights(output_nmsed_classes));
6556 params->outputs->push_back(TRT_TensorOrWeights(output_num_detections));
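  // The outputs are pushed in the order TF's CombinedNonMaxSuppression expects:
  // nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections (the plugin
  // itself reports the detection count at output index 0).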
6557
6558 return Status::OK();
6559 }
6560 #endif // IS_TRT_VERSION_GE(7, 1, 3, 0)
6561
6562 Status ConvertResize(OpConverterParams* params) {
6563 const auto& inputs = params->inputs;
6564 const auto& node_def = params->node_def;
6565 TF_RETURN_IF_ERROR(
6566 CheckInputsWeights(*params, {{"input", false}, {"size", true}}));
6567 TF_RETURN_IF_ERROR(AllowDataTypes(
6568 *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
6569
6570 // Get input tensor. Transpose it from NHWC to NCHW.
6571 ITensorProxyPtr inputs_tensor = inputs.at(0).tensor();
6572
6573 TFTRT_RETURN_ERROR_IF_NULLPTR(inputs_tensor, params->node_def.name());
6574
6575   // Get the output size. It must contain two values, i.e. [H_out, W_out].
6576 TRT_ShapedWeights weights = inputs.at(1).weights();
6577 if (weights.count() != 2) {
6578 return errors::Unimplemented("Resize to shape=[] is not supported, at ",
6579 node_def.name());
6580 }
6581 const int* weights_ptr = static_cast<int*>(weights.GetValues());
6582
6583 // Verify and consume node attributes.
6584 TFAttrs attrs(node_def);
6585 bool align_corners = attrs.get<bool>("align_corners");
6586 TF_RETURN_IF_ERROR(
6587 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
6588
6589 // Verify resize mode. Initialize resize mode if supported.
6590 nvinfer1::ResizeMode resize_mode;
6591 if (node_def.op() == "ResizeBilinear") {
6592 #if IS_TRT_VERSION_GE(7, 1, 0, 0)
6593 if (!align_corners) {
6594 return errors::InvalidArgument(
6595 "Cannot Convert Bilinear Resize when align_corners=False");
6596 }
6597 #endif
6598 resize_mode = nvinfer1::ResizeMode::kLINEAR;
6599 } else if (node_def.op() == "ResizeNearestNeighbor") {
6600 resize_mode = nvinfer1::ResizeMode::kNEAREST;
6601 } else {
6602 return errors::Unimplemented(node_def.op(), " is not yet implemented at ",
6603 node_def.name());
6604 }
6605
6606 // Validate inputs_tensor.
6607 // TODO: Allow dynamic shape for input-1 when shape input tensors are handled.
6608 const auto inputs_dims = inputs_tensor->getDimensions();
6609 if (!params->use_implicit_batch && !HasStaticShape(inputs_dims)) {
6610 return errors::Unimplemented(
6611 "TensorRT IResizeLayer requires input with static shape");
6612 }
6613
6614   // Return after validation if only validation is requested.
6615 if (params->validation_only) return Status::OK();
6616
6617 // Transpose tensor from NHWC to NCHW format.
6618 TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
6619 inputs_tensor, {0, 3, 1, 2}, &inputs_tensor, node_def, "to_NCHW"));
6620
6621 // Calculate output dimensions.
6622 // Given input dimensions [N, C, H, W] and output size [H_out, W_out],
6623   // the output dimensions equal [N, C, H_out, W_out].
6624 nvinfer1::Dims output_dimensions;
6625 output_dimensions.nbDims = inputs_tensor->getDimensions().nbDims;
6626 for (int i = 0; i < output_dimensions.nbDims; ++i) {
6627 output_dimensions.d[i] = inputs_tensor->getDimensions().d[i];
6628 }
6629 output_dimensions.d[output_dimensions.nbDims - 2] = weights_ptr[0];
6630 output_dimensions.d[output_dimensions.nbDims - 1] = weights_ptr[1];
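  // Illustrative example (hypothetical shapes): an NCHW input of [8, 3, 20, 30]
  // with size = [40, 60] gives output_dimensions = [8, 3, 40, 60]; only the two
  // spatial entries are overwritten.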
6631
6632 // Add resize layer.
6633 nvinfer1::IResizeLayer* layer =
6634 params->converter->network()->addResize(*inputs_tensor->trt_tensor());
6635 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
6636 params->converter->SetLayerName(layer, node_def);
6637
6638 // Set layer parameters.
6639 layer->setResizeMode(resize_mode);
6640 layer->setOutputDimensions(output_dimensions);
6641 layer->setAlignCorners(align_corners);
6642
6643 // Get output tensor. Transpose it from NCHW to NHWC.
6644 ITensorProxyPtr output = layer->getOutput(0);
6645
6646 TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
6647 output, {0, 2, 3, 1}, &output, node_def, "to_NHWC"));
6648 params->outputs->push_back(TRT_TensorOrWeights(output));
6649 // Success
6650 return Status::OK();
6651 } // ConvertResize
6652
6653 Status ConvertAddN(OpConverterParams* params) {
6654 const auto& inputs = params->inputs;
6655 const auto& node_def = params->node_def;
6656 TF_RETURN_IF_ERROR(
6657 AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
6658 TFAttrs attrs(node_def);
6659 const int num_inputs = attrs.get<int64>("N");
6660 if (num_inputs < 2) {
6661 return errors::InvalidArgument("AddN requires at least two inputs, at ",
6662 node_def.name());
6663 }
6664 if (inputs.size() != num_inputs) {
6665 return errors::InvalidArgument("Got ", inputs.size(),
6666 " inputs but expected ", num_inputs, ", at ",
6667 node_def.name());
6668 }
6669 for (const auto& input : inputs) {
6670 if (!input.is_tensor() && input.weights().shape_.d[0] != 1) {
6671 return errors::InvalidArgument(
6672 "Weights input to AddN is required to have batch dimension 1.");
6673 }
6674 }
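  // For example, a hypothetical [1, 4] weights input passes this check; its
  // batch dimension is stripped below before the weights are materialized as a
  // constant layer.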
6675 if (params->validation_only) return Status::OK();
6676
6677 // AddN doesn't support broadcast.
6678 std::vector<ITensorProxyPtr> tensor_inputs;
6679 for (const auto& input : inputs) {
6680 if (input.is_tensor()) {
6681 tensor_inputs.push_back(input.tensor());
6682 } else {
6683 auto dims = input.weights().shape_;
6684 TF_RETURN_IF_ERROR(RemoveBatchDimension(&dims));
6685 tensor_inputs.push_back(
6686 params->converter->CreateConstantLayer(input.weights(), dims));
6687 }
6688 }
6689 ITensorProxyPtr lhs = tensor_inputs[0];
6690 for (int i = 1; i < num_inputs; ++i) {
6691 ITensorProxyPtr rhs = tensor_inputs[i];
6692 nvinfer1::ILayer* layer = params->converter->network()->addElementWise(
6693 *lhs->trt_tensor(), *rhs->trt_tensor(),
6694 nvinfer1::ElementWiseOperation::kSUM);
6695 TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
6696 params->converter->SetLayerName(layer, node_def, std::to_string(i));
6697 lhs = layer->getOutput(0);
6698 }
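  // The inputs are summed by a left fold of elementwise kSUM layers,
  // lhs = (((in_0 + in_1) + in_2) + ...), so N inputs produce N - 1 layers.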
6699 params->outputs->push_back(TRT_TensorOrWeights(lhs));
6700 return Status::OK();
6701 }
6702
6703 static void RegisterValidatableOpConverters(
6704 std::unordered_map<string, OpConverter>* registration) {
6705 (*registration)["BiasAdd"] = ConvertBiasAdd;
6706 (*registration)["ClipByValue"] = ConvertClipByValue;
6707 #if IS_TRT_VERSION_GE(7, 1, 3, 0)
6708 (*registration)["CombinedNonMaxSuppression"] = ConvertCombinedNMS;
6709 #endif
6710 (*registration)["AddN"] = ConvertAddN;
6711 (*registration)["Cast"] = ConvertCast;
6712 (*registration)["ConcatV2"] = ConvertConcat;
6713 (*registration)["Const"] = ConvertConst;
6714 (*registration)["Conv2D"] = ConvertConv2D;
6715 (*registration)["Conv2DBackpropInput"] = ConvertConv2DBackpropInput;
6716 (*registration)["DepthToSpace"] = ConvertDepthSpaceShuffle;
6717 (*registration)["DepthwiseConv2dNative"] = ConvertConv2DDepthwise;
6718 #if IS_TRT_VERSION_GE(7, 1, 3, 0)
6719 (*registration)["Einsum"] = ConvertEinsum;
6720 #endif
6721 (*registration)["ExpandDims"] = ConvertExpandDims;
6722 (*registration)["FusedConv2DBiasActivation"] =
6723 ConvertFusedConv2DBiasActivation;
6724 (*registration)["GatherV2"] = ConvertGather;
6725 (*registration)["LeakyRelu"] = ConvertLeakyRelu;
6726 (*registration)["MatMul"] = ConvertMatMul;
6727 (*registration)["Pack"] = ConvertPack;
6728 (*registration)["Pad"] = ConvertPad;
6729 (*registration)["Relu6"] = ConvertRelu6;
6730 (*registration)["Reshape"] = ConvertReshape;
6731 (*registration)["Conv3D"] = ConvertConv3D;
6732 (*registration)["Conv3DBackpropInputV2"] = ConvertConv3DBackpropInputV2;
6733 for (auto resize_mode : {"ResizeBilinear", "ResizeNearestNeighbor"}) {
6734 (*registration)[resize_mode] = ConvertResize;
6735 }
6736 for (auto pool_op_type : {"AvgPool3D", "MaxPool3D"}) {
6737 (*registration)[pool_op_type] = ConvertPool3D;
6738 }
6739 (*registration)["Shape"] = ConvertShape;
6740 (*registration)["Rsqrt"] = ConvertRsqrt;
6741 (*registration)["Slice"] = ConvertSlice;
6742 (*registration)["Softmax"] = ConvertSoftmax;
6743 (*registration)["SpaceToDepth"] = ConvertDepthSpaceShuffle;
6744 (*registration)["Split"] = ConvertSplit;
6745 (*registration)["Square"] = ConvertSquare;
6746 (*registration)["SquaredDifference"] = ConvertSquaredDifference;
6747 (*registration)["Squeeze"] = ConvertSqueeze;
6748 (*registration)["StridedSlice"] = ConvertStridedSlice;
6749 (*registration)["TopKV2"] = ConvertTopK;
6750 (*registration)["Transpose"] = ConvertTranspose;
6751 (*registration)["Unpack"] = ConvertUnpack;
6752 (*registration)["_CopyFromHostToGpu"] = ConvertIdentity;
6753 for (auto quantization_op_type : *TrtNodeValidator::quantize_ops) {
6754 (*registration)[quantization_op_type] = ConvertQuantize;
6755 }
6756 for (const auto& binary_op_pair : *BinaryOperationMap()) {
6757 (*registration)[binary_op_pair.first] = ConvertBinary;
6758 }
6759 for (const auto& activation_op_pair : *ActivationTypeMap()) {
6760 (*registration)[activation_op_pair.first] = ConvertActivation;
6761 }
6762 for (auto pool_op_type : {"AvgPool", "MaxPool"}) {
6763 (*registration)[pool_op_type] = ConvertPool;
6764 }
6765 for (auto normalization_op_type :
6766 {"FusedBatchNorm", "FusedBatchNormV2", "FusedBatchNormV3"}) {
6767 (*registration)[normalization_op_type] = ConvertFusedBatchNorm;
6768 }
6769 for (const auto& unary_op_pair : *UnaryOperationMap()) {
6770 (*registration)[unary_op_pair.first] = ConvertUnary;
6771 }
6772 for (auto reduce_op_type : {"Sum", "Prod", "Max", "Min", "Mean"}) {
6773 (*registration)[reduce_op_type] = ConvertReduce;
6774 }
6775 for (auto arg_minmax_type : {"ArgMin", "ArgMax"}) {
6776 (*registration)[arg_minmax_type] = ConvertArgMinMax;
6777 }
6778 // The following are no-ops during inference and will not be mapped to any TRT
6779 // layer.
6780 for (auto identity_op_type : {"Identity", "Snapshot", "StopGradient"}) {
6781 (*registration)[identity_op_type] = ConvertIdentity;
6782 }
6783 for (auto batch_matmul_type : {"BatchMatMul", "BatchMatMulV2"}) {
6784 (*registration)[batch_matmul_type] = ConvertBatchMatMul;
6785 }
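  // A converter for an additional op would be registered the same way, e.g.
  // (hypothetical op and converter):
  //   (*registration)["MyOp"] = ConvertMyOp;
  // where ConvertMyOp has the same Status(OpConverterParams*) signature as the
  // converters above.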
6786 }
6787
6788 void TrtNodeValidator::RegisterOpValidators() {
6789 RegisterValidatableOpConverters(&op_validators_);
6790 }
6791
6792 void Converter::RegisterOpConverters() {
6793 RegisterValidatableOpConverters(&op_registry_);
6794 }
6795
6796 Status ConvertGraphDefToEngine(
6797 const GraphDef& gdef, TrtPrecisionMode precision_mode, int max_batch_size,
6798 size_t max_workspace_size_bytes,
6799 const std::vector<PartialTensorShape>& input_shapes,
6800 nvinfer1::ILogger* trt_logger, nvinfer1::IGpuAllocator* allocator,
6801 TRTInt8Calibrator* calibrator,
6802 TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, bool use_calibration,
6803 const bool use_implicit_batch, bool* convert_successfully,
6804 TrtShapeOptimizationProfile* profiles, absl::string_view engine_name) {
6805 engine->reset();
6806 if (convert_successfully) *convert_successfully = false;
6807
6808 // Creating converter, TensorRT builder and network
6809 auto statusor = Converter::Create(precision_mode, use_calibration, trt_logger,
6810 use_implicit_batch, engine_name);
6811 TF_RETURN_IF_ERROR(statusor.status());
6812 auto converter = std::move(statusor.ValueOrDie());
6813
6814 VLOG(1) << "Starting to convert TensorFlow ops to TensorRT layers";
6815 std::vector<Converter::EngineOutputInfo> output_tensors;
6816 int num_layers = converter->network()->getNbLayers();
6817 absl::flat_hash_set<const char*> layer_names;
6818 // Graph nodes are already topologically sorted during construction
6819 for (const auto& node_def : gdef.node()) {
6820 const string& node_name = node_def.name();
6821 VLOG(2) << "Converting node " << node_name << ", op=" << node_def.op();
6822 if (IsEngineInput(node_name)) {
6823 int32 slot_number = -1;
6824 string type_key;
6825 if (node_def.op() == "Placeholder") {
6826 if (!strings::safe_strto32( // non-absl ok
6827 node_name.c_str() + strlen(IONamePrefixes::kInputPHName),
6828 &slot_number)) {
6829 return errors::InvalidArgument("Failed to parse slot number from ",
6830 node_name);
6831 }
6832 type_key = "dtype";
6833 } else if (tensorflow::grappler::IsArg(node_def)) {
6834 // Maybe remove the dependence on grappler and re-implement IsArg,
6835 // which is pretty simple (but could change if new Arg nodes are added)
6836 slot_number = node_def.attr().at("index").i();
6837 type_key = "T";
6838 } else {
6839 return errors::InvalidArgument(
6840 "Node ", node_name,
6841           " is neither Placeholder nor Arg, instead ", node_def.op());
6842 }
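      // The slot number selects the entry of input_shapes that describes this
      // input: a Placeholder encodes it as the numeric suffix of its
      // kInputPHName-prefixed name, while an _Arg node carries it in its
      // "index" attribute.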
6843 nvinfer1::DataType trt_dtype;
6844 nvinfer1::Dims trt_dims;
6845 int batch_size = -1;
6846 auto shape = input_shapes.at(slot_number);
6847 auto status = ValidateTensorProperties(
6848 node_def.op(), node_def.attr().at(type_key).type(), shape,
6849 use_implicit_batch, /*validation_only=*/false, &trt_dtype, &trt_dims,
6850 &batch_size);
6851 if (!status.ok()) {
6852 const string error_message =
6853 StrCat("Validation failed for ", node_name, " and input slot ",
6854 slot_number, ": ", status.error_message());
6855 LOG_WARNING_WITH_PREFIX << error_message;
6856 return Status(status.code(), error_message);
6857 }
6858 VLOG(2) << "Adding engine input tensor " << node_name << " with shape "
6859 << DebugString(trt_dims);
6860 // TODO(laigd): the conversion should always happen at runtime where all
6861 // the shapes are known, and we can provide a mode to generate the
6862       // engines offline by calling sess.run() and caching/serializing the engines.
6863 TF_RETURN_IF_ERROR(converter->AddInputTensor(node_name, trt_dtype,
6864 trt_dims, batch_size));
6865 } else if (IsEngineOutput(node_name)) {
6866 int32 slot_number = -1;
6867 if (node_def.op() == "Identity") {
6868 if (!strings::safe_strto32( // non-absl ok
6869 node_name.c_str() + strlen(IONamePrefixes::kOutputPHName),
6870 &slot_number)) {
6871 return errors::InvalidArgument("Failed to parse slot number from ",
6872 node_name);
6873 }
6874 } else if (tensorflow::grappler::IsRetval(node_def)) {
6875 slot_number = node_def.attr().at("index").i();
6876 } else {
6877 return errors::InvalidArgument(
6878 "Node with name ", node_name,
6879 " starting with IONamePrefixes::kOutputPHName is "
6880 "neither Identity nor Retval, instead ",
6881 node_def.op());
6882 }
6883 // Get output type that TensorFlow expects
6884 TFAttrs attrs(node_def);
6885 DataType tf_dtype = attrs.get<DataType>("T");
6886 nvinfer1::DataType trt_dtype;
6887 TF_RETURN_IF_ERROR(TfTypeToTrtType(tf_dtype, &trt_dtype));
6888 if (output_tensors.size() <= slot_number) {
6889 output_tensors.resize(slot_number + 1);
6890 }
6891 output_tensors.at(slot_number) = {node_def.input(0), node_name,
6892 trt_dtype};
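      // output_tensors is indexed by slot number so that the engine outputs
      // keep the ordering expected by the surrounding TF graph.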
6893 } else {
6894 TF_RETURN_IF_ERROR(converter->ConvertNode(node_def));
6895 }
6896
6897 // To support TF-TRT profiling, we ensure each ILayer has a non-empty name.
6898 // BuildCudaEngine returns an error if there is any ILayer name collision.
6899 // We want to report the error here before BuildCudaEngine in a more
6900 // meaningful way.
6901 int new_num_layers = converter->network()->getNbLayers();
6902 for (int i = num_layers; i < new_num_layers; i++) {
6903 auto layer = converter->network()->getLayer(i);
6904 if (layer->getName() == nullptr ||
6905 !layer_names.insert(layer->getName()).second) {
6906 std::string error_message =
6907 absl::StrCat("Converting node ", node_name, ", op=", node_def.op(),
6908                          layer->getName() ? " created a layer with a name collision"
6909                                           : " created a layer without a name");
6910 LOG_WARNING_WITH_PREFIX << error_message;
6911 return errors::Internal(error_message);
6912 }
6913 }
6914 num_layers = new_num_layers;
6915 }
6916 TF_RETURN_IF_ERROR(converter->RenameAndMarkOutputTensors(output_tensors));
6917 if (convert_successfully) *convert_successfully = true;
6918
6919 // Apply user provided quantization ranges to tensors
6920 converter->MaybeApplyQuantizationRanges();
6921
6922 // Build the engine.
6923 TF_RETURN_IF_ERROR(converter->BuildCudaEngine(
6924 engine, max_batch_size, max_workspace_size_bytes, allocator, calibrator,
6925 profiles));
6926
6927 VLOG(1) << "Finished conversion";
6928 return Status::OK();
6929 }
6930
6931 Status ConvertSegmentToGraphDef(
6932 const Graph* graph, const grappler::GraphProperties& graph_properties,
6933 const std::vector<const Node*>& subgraph_nodes, // In topological order
6934 EngineInfo* engine_info) {
6935 std::vector<EngineConnection>* connections = &engine_info->connections;
6936 GraphDef* segment_def = &engine_info->segment_graph_def;
6937 bool has_int32_input = false;
6938 std::set<string> marker_nodes;
6939 // Update connection shapes/data types and add corresponding input/output
6940 // nodes in the segment graphdef.
6941 for (size_t i = 0; i < connections->size(); ++i) {
6942 auto& connection = connections->at(i);
6943 if (connection.is_control_edge()) continue;
6944 auto outside_node = graph->FindNodeId(connection.outside_id);
6945 if (!outside_node) {
6946 // This should never happen, unless the original graph is problematic.
6947 return errors::NotFound("Cannot find node with id ",
6948 connection.outside_id, " in the graph.");
6949 }
6950 // Updates the shape and data types of input/output connections.
6951 DataType dtype;
6952 PartialTensorShape partial_shape;
6953 if (connection.is_input_edge) {
6954 GetOutputProperties(graph_properties,
6955 graph->FindNodeId(connection.outside_id),
6956 connection.outside_port, &partial_shape, &dtype);
6957 connection.outside_shape = partial_shape;
6958 } else {
6959 GetInputProperties(graph_properties,
6960 graph->FindNodeId(connection.outside_id),
6961 connection.outside_port, &partial_shape, &dtype);
6962 connection.inside_shape = partial_shape;
6963 }
6964 connection.connection_type = dtype;
6965
6966 // Add dummy input/output nodes to the segment graphdef.
6967 if (connection.is_input_edge) {
6968 if (dtype == DT_INT32 && !has_int32_input) {
6969 has_int32_input = true;
6970 }
6971
6972 const string node_name =
6973 StrCat(IONamePrefixes::kInputPHName, connection.port_number);
6974 if (marker_nodes.count(node_name)) {
6975 VLOG(1) << "Reusing input " << node_name << " for the edge "
6976 << connection.outside_node_name << ":"
6977 << connection.outside_port << " -> "
6978 << connection.inside_node_name << ":" << connection.inside_port;
6979 continue;
6980 }
6981 marker_nodes.insert(node_name);
6982 auto seg_node = segment_def->add_node();
6983 NodeDefBuilder builder(node_name, "_Arg");
6984 auto status = builder.Attr("shape", partial_shape)
6985 .Attr("T", dtype)
6986 .Attr("index", connection.port_number)
6987 .Finalize(seg_node);
6988 VLOG(1) << "Constructing input " << node_name << " for the edge "
6989 << connection.outside_node_name << ":" << connection.outside_port
6990 << " -> " << connection.inside_node_name << ":"
6991 << connection.inside_port;
6992 } else {
6993 const string node_name =
6994 StrCat(IONamePrefixes::kOutputPHName, connection.port_number);
6995 if (marker_nodes.count(node_name)) {
6996 VLOG(1) << "Reusing output " << node_name << " for the edge "
6997 << connection.inside_node_name << ":" << connection.inside_port
6998 << " -> " << connection.outside_node_name << ":"
6999 << connection.outside_port;
7000 continue;
7001 }
7002 marker_nodes.insert(node_name);
7003 auto seg_node = segment_def->add_node();
7004 NodeDefBuilder builder(node_name, "_Retval");
7005 auto status =
7006 builder.Attr("T", dtype)
7007 .Attr("index", connection.port_number)
7008 .Input(connection.inside_node_name, connection.inside_port, dtype)
7009 .Finalize(seg_node);
7010 VLOG(1) << "Constructing output " << node_name << " for the edge "
7011 << connection.inside_node_name << ":" << connection.inside_port
7012 << " -> " << connection.outside_node_name << ":"
7013 << connection.outside_port;
7014 }
7015 } // for each connection.
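  // At this point every non-control input connection is represented by an _Arg
  // marker node (kInputPHName + port number) and every output connection by a
  // _Retval marker node (kOutputPHName + port number) in the segment graphdef.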
7016
7017 std::set<string> subgraph_node_names;
7018 for (const Node* node : subgraph_nodes) {
7019 subgraph_node_names.insert(node->name());
7020 }
7021
7022 std::unordered_map<int, int> old_to_new_id_map;
7023 // Copy internal nodes to new graphdef
7024 string local_scope = subgraph_nodes.front()->name();
7025 for (const Node* node : subgraph_nodes) {
7026 local_scope = GetCommonNameScope(local_scope, node->name());
7027 old_to_new_id_map[node->id()] = segment_def->node_size();
7028 auto snode = segment_def->add_node();
7029 *snode = node->def();
7030 if (snode->op() == "Shape") {
7031 const std::string copy_op_name = snode->name();
7032 std::string shape_op_name = copy_op_name + "_cpu_result";
7033
7034       // Add a node to copy the Shape op output to the GPU. Reuse the Shape op
7035       // node name for this new node so that downstream consumers switch to the
7036       // result of the new node without having to change the name they refer to.
7037 NodeDef* copy_op = segment_def->add_node();
7038 copy_op->set_name(copy_op_name);
7039 copy_op->set_op("_CopyFromHostToGpu");
7040 *copy_op->add_input() = shape_op_name + ":0";
7041 tensorflow::DataType type = snode->attr().at("out_type").type();
7042 AddNodeAttr("T", type, copy_op);
7043 AddNodeAttr("out_type", type, copy_op);
7044
7045 // Rename the Shape OP node and add the new name to the set of node names
7046 // for the engine.
7047 snode->set_name(shape_op_name);
7048 subgraph_node_names.insert(shape_op_name);
7049 VLOG(2) << "Add copy node " << copy_op->DebugString();
7050 }
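    // For a Shape node hypothetically named "shape", the node is renamed
    // "shape_cpu_result" and a new _CopyFromHostToGpu node named "shape"
    // consumes "shape_cpu_result:0", so consumers of "shape:0" now read the
    // GPU copy without any renaming on their side.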
7051 VLOG(2) << "Copying " << snode->name() << " to subgraph";
7052 }
7053 // Update the inputs of the new input nodes to point to placeholder nodes.
7054 for (int i = 0; i < connections->size(); ++i) {
7055 auto& connection = connections->at(i);
7056 if (connection.is_control_edge() || !connection.is_input_edge) continue;
7057 auto snode =
7058 segment_def->mutable_node(old_to_new_id_map[connection.inside_id]);
7059 const string arg_name =
7060 StrCat(IONamePrefixes::kInputPHName, connection.port_number);
7061 VLOG(1) << "Updating " << snode->name() << ":" << connection.inside_port
7062 << " from " << snode->input(connection.inside_port) << " to "
7063 << arg_name;
7064 snode->set_input(connection.inside_port, arg_name);
7065 }
7066
7067 // Remove control inputs that are not inside the segment.
7068 for (int i = 0; i < segment_def->node_size(); ++i) {
7069 auto snode = segment_def->mutable_node(i);
7070 const int input_size = snode->input_size();
7071 int input_idx = 0;
7072 int actual_input_idx = 0;
7073 while (input_idx < input_size) {
7074 TensorId input = ParseTensorName(snode->input(input_idx));
7075 if (!subgraph_node_names.count(
7076 string(input.first.data(), input.first.size())) &&
7077 !IsEngineInput(input.first)) {
7078 if (input.second == Graph::kControlSlot) {
7079           VLOG(1) << "... removing control input " << input.first
7080 << " from subgraph.";
7081 ++input_idx;
7082 continue;
7083 } else {
7084 return errors::InvalidArgument(
7085 "Found non control input outside the segment that is not an "
7086 "engine connection to ",
7087 snode->name(), ": ", input.first);
7088 }
7089 }
7090 if (actual_input_idx != input_idx) {
7091 snode->set_input(actual_input_idx, snode->input(input_idx));
7092 }
7093 ++input_idx;
7094 ++actual_input_idx;
7095 }
7096 for (int remove = input_size - actual_input_idx; remove > 0; --remove) {
7097 snode->mutable_input()->RemoveLast();
7098 }
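    // For example, if a node had inputs {"a", "^outside", "b"} and "outside"
    // (a hypothetical name) is not part of the segment, the control input is
    // skipped, "b" is shifted into its slot, and the unused trailing input
    // slot is removed.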
7099 }
7100 engine_info->engine_name = StrCat(local_scope, engine_info->engine_name);
7101 engine_info->has_int32_input = has_int32_input;
7102 return Status::OK();
7103 }
7104
7105 bool OutputEdgeValidator::operator()(const Edge* out_edge) const {
7106 if (out_edge->IsControlEdge()) return true;
7107 if (out_edge->src()->type_string() == "Const") {
7108 VLOG(1) << "--> Need to remove output node " << out_edge->src()->name()
7109 << " which is a Const.";
7110 return false;
7111 }
7112 return true;
7113 }
7114
7115 } // namespace convert
7116 } // namespace tensorrt
7117 } // namespace tensorflow
7118
7119 #endif // GOOGLE_CUDA && GOOGLE_TENSORRT
7120