/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"

#include <algorithm>
#include <cmath>
#include <cstring>
#include <map>
#include <memory>
#include <set>
#include <unordered_map>
#include <utility>
#include <vector>

#include "absl/algorithm/container.h"
#include "absl/container/flat_hash_set.h"
#include "absl/memory/memory.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/string_view.h"
#include "tensorflow/compiler/tf2tensorrt/common/utils.h"
#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h"
#include "tensorflow/core/common_runtime/graph_constructor.h"
#include "tensorflow/core/framework/node_def.pb.h"  // NOLINT
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"  // NOLINT
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/grappler/op_types.h"
#include "tensorflow/core/kernels/linalg/einsum_op_impl.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/platform/tensor_coding.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/lib/annotated_traceme.h"
#include "tensorflow/core/public/version.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/core/util/strided_slice_op.h"

#if GOOGLE_CUDA && GOOGLE_TENSORRT
#include "third_party/tensorrt/NvInfer.h"
#include "third_party/tensorrt/NvInferPlugin.h"

// Check that the two types are equal. Cast both to int first so that the
// values are printable in the failure log message.
#define TFTRT_CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2)

#define TFTRT_INTERNAL_ERROR_AT_NODE(node)                           \
  do {                                                               \
    return errors::Internal("TFTRT::", __FUNCTION__, ":", __LINE__,  \
                            " failed to add TRT layer, at: ", node); \
  } while (0)

#define TFTRT_RETURN_ERROR_IF_NULLPTR(ptr, node) \
  do {                                           \
    if (ptr == nullptr) {                        \
      TFTRT_INTERNAL_ERROR_AT_NODE(node);        \
    }                                            \
  } while (0)
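
// Usage sketch (illustrative, with hypothetical network/input_tensor/node_def
// variables): guard a possibly-null pointer returned by a TensorRT builder
// call:
//   nvinfer1::IShuffleLayer* layer = network->addShuffle(*input_tensor);
//   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
// On a null pointer this returns an Internal error naming the node.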

namespace tensorflow {
namespace tensorrt {
namespace convert {

using absl::StrAppend;
using absl::StrCat;

namespace {

#define ADD_LAYER(layer_name)              \
  case nvinfer1::LayerType::k##layer_name: \
    return #layer_name;
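
// For example, ADD_LAYER(CONVOLUTION) expands to
//   case nvinfer1::LayerType::kCONVOLUTION:
//     return "CONVOLUTION";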

const char* LayerTypeToString(nvinfer1::LayerType layer_type) {
  switch (layer_type) {
    ADD_LAYER(CONVOLUTION)
    ADD_LAYER(FULLY_CONNECTED)
    ADD_LAYER(ACTIVATION)
    ADD_LAYER(POOLING)
    ADD_LAYER(LRN)
    ADD_LAYER(SCALE)
    ADD_LAYER(SOFTMAX)
    ADD_LAYER(DECONVOLUTION)
    ADD_LAYER(CONCATENATION)
    ADD_LAYER(ELEMENTWISE)
    ADD_LAYER(PLUGIN)
    ADD_LAYER(UNARY)
    ADD_LAYER(PADDING)
    ADD_LAYER(SHUFFLE)
    ADD_LAYER(REDUCE)
    ADD_LAYER(TOPK)
    ADD_LAYER(GATHER)
    ADD_LAYER(MATRIX_MULTIPLY)
    ADD_LAYER(RAGGED_SOFTMAX)
    ADD_LAYER(CONSTANT)
    ADD_LAYER(RNN_V2)
    ADD_LAYER(IDENTITY)
    ADD_LAYER(PLUGIN_V2)
    ADD_LAYER(SLICE)
    ADD_LAYER(SHAPE)
    ADD_LAYER(PARAMETRIC_RELU)
    ADD_LAYER(RESIZE)
    ADD_LAYER(TRIP_LIMIT)
    ADD_LAYER(RECURRENCE)
    ADD_LAYER(ITERATOR)
    ADD_LAYER(LOOP_OUTPUT)
    ADD_LAYER(SELECT)
    ADD_LAYER(FILL)
#if IS_TRT_VERSION_GE(8, 0, 0, 0)
    ADD_LAYER(QUANTIZE)
    ADD_LAYER(DEQUANTIZE)
#else
    // The TRT IRNNv2Layer has been deprecated in favor of the loop API.
    ADD_LAYER(RNN)
#endif
  }
  return "UNKNOWN_LAYER";
}

#undef ADD_LAYER

// Sets the ILayer name in the form of
// <engine_name>/<tf_related_part>:<trt_operation_name>.
void SetLayerNameHelper(nvinfer1::ILayer* layer, absl::string_view engine_name,
                        absl::string_view tf_name) {
  const char* trt_name = LayerTypeToString(layer->getType());
  layer->setName(
      absl::StrCat(engine_name, "/", tf_name, ":", trt_name).c_str());
}
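
// For example, an engine named "my_engine" and a TF op "block1/conv" that
// maps to a convolution layer yield the name
// "my_engine/block1/conv:CONVOLUTION".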

// Returns a string in the form of <sub_op_name><sub_op_instance>.
std::string GetLayerNameSuffix(absl::string_view sub_op_name,
                               absl::optional<int> sub_op_instance) {
  std::string op_suffix(sub_op_name);
  if (sub_op_instance.has_value()) {
    op_suffix =
        absl::StrCat(op_suffix, "_", std::to_string(sub_op_instance.value()));
  }
  return op_suffix;
}
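
// For example, GetLayerNameSuffix("reshape", 2) returns "reshape_2"; the
// instance number is omitted when sub_op_instance is absl::nullopt.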

}  // namespace

bool IsEngineInput(absl::string_view name) {
  return absl::StartsWith(name, IONamePrefixes::kInputPHName);
}
bool IsEngineOutput(absl::string_view name) {
  return absl::StartsWith(name, IONamePrefixes::kOutputPHName);
}

class TFAttrs {
 public:
  explicit TFAttrs(const NodeDef& tf_node) {
    for (const auto& attr : tf_node.attr()) {
      attrs_.insert({attr.first, &attr.second});
    }
  }

  bool count(const string& key) const { return attrs_.count(key); }

  AttrValue const* at(const string& key) const {
    if (!attrs_.count(key)) {
      LOG(FATAL) << "Attribute not found: " << key;
    }
    return attrs_.at(key);
  }

  template <typename T>
  T get(const string& key) const;

  template <typename T>
  T get(const string& key, const T& default_value) const {
    return attrs_.count(key) ? this->get<T>(key) : default_value;
  }

 private:
  std::map<string, AttrValue const*> attrs_;
};

template <>
string TFAttrs::get<string>(const string& key) const {
  return this->at(key)->s();
}

template <>
std::vector<int64> TFAttrs::get<std::vector<int64>>(const string& key) const {
  auto attr = this->at(key)->list().i();
  return std::vector<int64>(attr.begin(), attr.end());
}

template <>
std::vector<float> TFAttrs::get<std::vector<float>>(const string& key) const {
  auto attr = this->at(key)->list().f();
  return std::vector<float>(attr.begin(), attr.end());
}

template <>
nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(const string& key) const {
  nvinfer1::DataType trt_dtype(nvinfer1::DataType::kFLOAT);
  TF_CHECK_OK(TfTypeToTrtType(this->at(key)->type(), &trt_dtype));
  return trt_dtype;
}

template <>
DataType TFAttrs::get<DataType>(const string& key) const {
  return this->at(key)->type();
}

template <>
float TFAttrs::get<float>(const string& key) const {
  return this->at(key)->f();
}

template <>
bool TFAttrs::get<bool>(const string& key) const {
  return this->at(key)->b();
}

template <>
int64 TFAttrs::get<int64>(const string& key) const {
  return this->at(key)->i();
}
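
// Usage sketch (illustrative): read attributes off a NodeDef, falling back to
// a default when an attribute is absent:
//   TFAttrs attrs(node_def);
//   string data_format = attrs.get<string>("data_format", "NHWC");
//   DataType dtype = attrs.get<DataType>("T");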

// TODO(laigd): use this utility function in more places.
Status RemoveBatchDimension(nvinfer1::Dims* dims) {
  if (dims->nbDims < 2) {
    return errors::InvalidArgument(
        "Dropping batch dimension requires dims with rank>=2.");
  }
  std::copy(dims->d + 1, dims->d + dims->nbDims, dims->d);
  dims->nbDims--;
  return Status::OK();
}
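
// For example, dims {nbDims = 3, d = {8, 224, 224}} becomes
// {nbDims = 2, d = {224, 224}}.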

void GetOutputProperties(const grappler::GraphProperties& graph_properties,
                         const Node* node, const int out_port,
                         PartialTensorShape* shape, DataType* dtype) {
  if (graph_properties.HasOutputProperties(node->name())) {
    auto output_params = graph_properties.GetOutputProperties(node->name());
    auto out_shape = output_params.at(out_port);
    *dtype = out_shape.dtype();
    *shape = out_shape.shape();
  } else {
    LOG(INFO) << "Unknown output shape at node: " << node->name();
    *dtype = node->output_type(out_port);
  }
}

void GetInputProperties(const grappler::GraphProperties& graph_properties,
                        const Node* node, const int in_port,
                        PartialTensorShape* shape, DataType* dtype) {
  if (graph_properties.HasInputProperties(node->name())) {
    auto input_params = graph_properties.GetInputProperties(node->name());
    auto in_shape = input_params.at(in_port);
    *dtype = in_shape.dtype();
    *shape = in_shape.shape();
  } else {
    *dtype = node->input_type(in_port);
  }
}

// This function checks if a tensor is compatible with TRT.
//
// We check that the shape and datatype are compatible with TensorRT. We also
// return the corresponding trt_dtype, the trt_dims and the batch_size (the
// latter is only needed in implicit batch mode).
//
// The return status indicates whether the tensor is compatible.
//
// For implicit batch mode, when validation_only == false, we also check that
// all input dimensions (besides the batch dimension) are known dimensions.
Status ValidateTensorProperties(const string& producer_node_type,
                                const DataType dtype,
                                const PartialTensorShape& shape,
                                const bool use_implicit_batch,
                                bool validation_only,
                                nvinfer1::DataType* trt_dtype,
                                nvinfer1::Dims* trt_dims, int* batch_size) {
  // Convert data type.
  TF_RETURN_IF_ERROR(TfTypeToTrtType(dtype, trt_dtype));

  // Convert shape.
  if (shape.dims() < 0) {
    return errors::InvalidArgument("Input tensor rank is unknown.");
  }
  // Add 1 to maximum rank for implicit batch dim.
  const int max_rank = nvinfer1::Dims::MAX_DIMS + (use_implicit_batch ? 1 : 0);
  if (shape.dims() > max_rank) {
    return errors::OutOfRange("Input tensor rank is greater than ", max_rank);
  }
  if (use_implicit_batch && (producer_node_type != "Const") &&
      (shape.dims() < 1)) {
    return errors::InvalidArgument(
        "Scalar input tensor is not supported since the first dimension "
        "is treated as batch dimension by TRT");
  }
  TF_RETURN_IF_ERROR(
      TensorShapeToTrtDims(shape,
                           /*ignore_first_dim=*/use_implicit_batch, trt_dims));
  // Get the batch size for the tensor if it will not be included in the shape.
  if (use_implicit_batch) {
    *batch_size = shape.dim_size(0);
  }

  // Don't convert empty tensors (dim value of 0).
  const int first_trt_dim = use_implicit_batch ? 1 : 0;
  for (int d = first_trt_dim; d < shape.dims(); ++d) {
    if (shape.dim_size(d) == 0) {
      return errors::Unimplemented(
          "Input tensor with shape ", shape.DebugString(),
          " is an empty tensor, which is not supported by TRT");
    }
  }

  if (validation_only) return Status::OK();

  // The following checks are only used during TRT engine creation time.
  if (use_implicit_batch) {
    for (int d = first_trt_dim; d < shape.dims(); ++d) {
      if (shape.dim_size(d) < 0) {
        return errors::InvalidArgument(
            "Input tensor with shape ", shape.DebugString(),
            " has an unknown non-batch dimension at dim ", d);
      }
    }
  }
  return Status::OK();
}

Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l,
                            const TRT_TensorOrWeights& operand_r,
                            const bool check_feasibility,
                            const bool use_implicit_batch,
                            nvinfer1::Dims* operand_l_new_dims,
                            nvinfer1::Dims* operand_r_new_dims) {
  // The TensorRT Elementwise op supports broadcast but requires both tensors
  // to be of identical rank.
  //
  // We consider the following cases:
  //   1. operand_l is a Tensor & operand_r is a Const;
  //   2. operand_l is a Tensor & operand_r is a Tensor;
  // Note: const op const (constant folding) should fall back to TensorFlow.
  //
  // Broadcast scheme:
  //       T:  1 3 5    (a tensor does not have the batch dimension)
  //       W:  1 1 3 1  (weights have all explicit dimensions)
  // i. fill in explicit dimensions
  //    -> T: -1 1 3 5  (we put a -1 for the batch dimension)
  //    -> W:  1 1 3 1
  // ii. compare broadcast feasibility
  //
  // We cannot support the following, since TensorRT does not allow
  // manipulation of the batch dimension, so we cannot generate output with
  // the proper shape:
  //    T: 3 5 1
  //    W: 1 1 1  1 3 5 1
  // -> T: 1 1 1 -1 3 5 1
  // -> W: 1 1 1  1 3 5 1
  // **************************************************************************
  if (!operand_l.is_tensor() && !operand_r.is_tensor()) {
    return errors::InvalidArgument(
        "Broadcasting requires at least one of the operands to be a tensor");
  }

  const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1;
  auto compute_output_dims = [use_implicit_batch](
                                 const TRT_TensorOrWeights& input,
                                 int broadcast_num_dims, int* output_dims_array,
                                 nvinfer1::Dims* output_dims) {
    const nvinfer1::Dims input_dims = input.GetTrtDims();
    std::fill(output_dims_array, output_dims_array + max_nb_dims, 1);
    std::copy(input_dims.d, input_dims.d + input_dims.nbDims,
              output_dims_array + broadcast_num_dims - input_dims.nbDims);
    if (use_implicit_batch && input.is_tensor()) {
      const int true_input_dims = input_dims.nbDims + 1;
      if (true_input_dims < broadcast_num_dims) {
        return errors::InvalidArgument(
            "Broadcasting beyond batch dimension is not supported ",
            "(tensor #dims ", true_input_dims, " vs broadcast #dims ",
            broadcast_num_dims, ")");
      }
      // Set the batch dimension to -1, since the batch size is not supposed
      // to be broadcasted.
      output_dims_array[0] = -1;
    }
    // Copy to output dimensions.
    if (use_implicit_batch) {
      // Strip the batch dimension while copying.
      output_dims->nbDims = broadcast_num_dims - 1;
      std::copy(output_dims_array + 1, output_dims_array + broadcast_num_dims,
                output_dims->d);
    } else {
      output_dims->nbDims = broadcast_num_dims;
      std::copy(output_dims_array, output_dims_array + broadcast_num_dims,
                output_dims->d);
    }

    return Status::OK();
  };

  // Compute the output dimensions.
  const int broadcast_num_dims =
      std::max(operand_l.GetTrtDims().nbDims +
                   (use_implicit_batch && operand_l.is_tensor()),
               operand_r.GetTrtDims().nbDims +
                   (use_implicit_batch && operand_r.is_tensor()));
  int output_l[max_nb_dims], output_r[max_nb_dims];
  TF_RETURN_IF_ERROR(compute_output_dims(operand_l, broadcast_num_dims,
                                         output_l, operand_l_new_dims));
  TF_RETURN_IF_ERROR(compute_output_dims(operand_r, broadcast_num_dims,
                                         output_r, operand_r_new_dims));

  // Compare broadcast feasibility
  if (check_feasibility) {
    for (int i = 0; i < broadcast_num_dims; ++i) {
      if (!use_implicit_batch && (output_l[i] == -1 || output_r[i] == -1)) {
        // If this condition holds, we are in explicit batch mode and at least
        // one of the input dimensions is unknown; in other words, we are in
        // dynamic shape mode. During conversion time we only see -1 for the
        // unknown shapes, so we cannot decide on the feasibility of broadcast
        // over the unknown dimensions; we just continue to the next dimension.
        // In dynamic shape mode TRT can only check the feasibility of the
        // broadcast when the actual input dimensions are specified by
        // SetTrtEngineInputs and the inference job is launched by TrtEnqueue.
        continue;
      }
      if ((output_l[i] != output_r[i]) && (output_l[i] != 1) &&
          (output_r[i] != 1)) {
        return errors::InvalidArgument("Infeasible broadcast scheme (",
                                       "batch_dim: ", output_l[0], ", ",
                                       DebugString(*operand_l_new_dims), " vs ",
                                       "batch_dim: ", output_r[0], ", ",
                                       DebugString(*operand_r_new_dims), ")");
      }
    }
  }
  return Status::OK();
}

// Prepares a dynamic shape tensor for broadcast by adding leading 1
// dimensions.
Status DynamicBroadcast(ITensorProxyPtr operand, OpConverterParams* params,
                        ITensorProxyPtr* output, int broadcasted_nbDims) {
  int operand_nbDims = operand->getDimensions().nbDims;
  if (broadcasted_nbDims > operand_nbDims) {
    if (params->validation_only) return Status::OK();
    int n_extra_dims = broadcasted_nbDims - operand_nbDims;
    VLOG(2) << "Dynamic broadcast adding " << n_extra_dims << " leading 1s";
    TF_RETURN_IF_ERROR(params->converter->DynamicReshape(
        operand, {std::make_pair(0, operand_nbDims)}, params, output,
        {n_extra_dims}));
  } else {
    *output = operand;
  }
  return Status::OK();
}

Status BroadcastWeights(std::unique_ptr<TRT_TensorOrWeights>& p,
                        nvinfer1::Dims broadcasted_dims) {
  if (!p->is_weights()) return errors::Internal("Weight input expected");
  if (p->GetTrtDims().nbDims != broadcasted_dims.nbDims) {
    TRT_ShapedWeights weights(p->weights());
    TF_RETURN_IF_ERROR(weights.SetShape(broadcasted_dims));
    p = std::make_unique<TRT_TensorOrWeights>(weights);
  }
  return Status::OK();
}

Status ApplyBroadcast(std::unique_ptr<TRT_TensorOrWeights>& operand,
                      nvinfer1::Dims broadcasted_dims,
                      OpConverterParams* params) {
  if (operand->is_weights()) {
    TF_RETURN_IF_ERROR(BroadcastWeights(operand, broadcasted_dims));
  } else {
    ITensorProxyPtr tensor = nullptr;
    auto is_static_shuffle_compatible = [](nvinfer1::Dims dims) {
      return std::count(dims.d, dims.d + dims.nbDims, -1) <= 1;
    };
    if (is_static_shuffle_compatible(broadcasted_dims)) {
      TF_RETURN_IF_ERROR(PrepareTensorForShape(
          params->converter, *operand, broadcasted_dims,
          params->validation_only, &tensor, params->node_def));
    } else {
      TF_RETURN_IF_ERROR(DynamicBroadcast(operand->tensor(), params, &tensor,
                                          broadcasted_dims.nbDims));
    }
    operand = std::make_unique<TRT_TensorOrWeights>(tensor);
  }
  return Status::OK();
}

// Inserts leading 1 dimensions so that both operands have the same rank.
// Note: In implicit batch mode, weights' shape can include an explicit 1
// batch dimension. The broadcasted shape might lose this leading batch dim,
// because the broadcasted shape does not include the implicit batch dim.
// TODO(tfeher): Other code blocks that use GetTrtBroadcastShape need to be
// fixed to use this routine to handle dynamic inputs. Eventually,
// GetTrtBroadcastShape should only be used by this routine.
Status BroadcastTensors(std::unique_ptr<TRT_TensorOrWeights>& operand_l,
                        std::unique_ptr<TRT_TensorOrWeights>& operand_r,
                        bool check_feasibility, OpConverterParams* params) {
  nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r;
  TF_RETURN_IF_ERROR(GetTrtBroadcastShape(
      *operand_l, *operand_r, check_feasibility, params->use_implicit_batch,
      &broadcasted_dims_l, &broadcasted_dims_r));

  if (params->validation_only) return Status::OK();

  TF_RETURN_IF_ERROR(ApplyBroadcast(operand_l, broadcasted_dims_l, params));
  TF_RETURN_IF_ERROR(ApplyBroadcast(operand_r, broadcasted_dims_r, params));

  return Status::OK();
}

ITensorProxyPtr Converter::CreateConstantLayer(const TRT_ShapedWeights& weights,
                                               const nvinfer1::Dims& dims) {
  nvinfer1::Weights trt_weights = weights.GetTrtWeights();
  nvinfer1::IConstantLayer* layer = network()->addConstant(dims, trt_weights);
  if (!layer) return nullptr;
  SetLayerName(layer, "_tftrt_constant_",
               std::to_string(next_constant_layer_id_));
  next_constant_layer_id_++;
  ITensorProxyPtr trt_tensor = layer->getOutput(0);
  return trt_tensor;
}

// Creates a scalar constant and fills it with value.
template <typename T>
Status CreateScalarConstant(
    OpConverterParams* params, T value, ITensorProxyPtr* tensor,
    nvinfer1::DataType trt_type = nvinfer1::DataType::kINT32,
    const nvinfer1::Dims& dims = {1, {1}}) {
  TRT_ShapedWeights weights =
      params->weight_store->GetTempWeights(trt_type, dims);
  TF_RETURN_IF_ERROR(weights.SetValues(value));
  *tensor = params->converter->CreateConstantLayer(weights, dims);
  TFTRT_RETURN_ERROR_IF_NULLPTR(*tensor, params->node_def.name());
  return Status::OK();
}
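
// Usage sketch (illustrative): materialize the INT32 scalar 1 as a TRT
// constant:
//   ITensorProxyPtr one = nullptr;
//   TF_RETURN_IF_ERROR(CreateScalarConstant(params, 1, &one));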

// Creates a constant with the same rank as dims, where each dimension has
// size = 1.
Status CreateBroadcastableScalarConstant(OpConverterParams* params, float value,
                                         const nvinfer1::Dims& dims,
                                         ITensorProxyPtr* tensor,
                                         const char* dtype_attr_name = "T") {
  nvinfer1::DataType trt_type = nvinfer1::DataType::kFLOAT;  // Default to FP32.
  TFAttrs attrs(params->node_def);
  if (attrs.count(dtype_attr_name)) {
    DataType dtype = attrs.get<DataType>(dtype_attr_name);
    TF_RETURN_IF_ERROR(TfTypeToTrtType(dtype, &trt_type));
  }

  // In order to be broadcastable, the number of dims has to match.
  nvinfer1::Dims broadcastable_dims(dims);
  for (int i = 0; i < broadcastable_dims.nbDims; i++) {
    broadcastable_dims.d[i] = 1;
  }
  return CreateScalarConstant(params, value, tensor, trt_type,
                              broadcastable_dims);
}
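
// For example, given dims {3, {2, 28, 28}}, the constant is created with
// dims {3, {1, 1, 1}} so that it can broadcast against the input.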

// The function concatenates tensors on the first axis. This can be used to
// create a shape tensor from individual dimension sizes.
StatusOr<ITensorProxyPtr> ConcatenateTensors(
    OpConverterParams* params, const std::vector<ITensorProxyPtr> input_tensors,
    absl::optional<int> op_instance = absl::nullopt) {
  std::vector<nvinfer1::ITensor*> trt_input_tensors;
  for (const auto& t : input_tensors) {
    trt_input_tensors.push_back(t->trt_tensor());
  }
  nvinfer1::IConcatenationLayer* layer =
      params->converter->network()->addConcatenation(
          static_cast<nvinfer1::ITensor* const*>(trt_input_tensors.data()),
          input_tensors.size());
  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, params->node_def.op());
  params->converter->SetLayerName(layer, params->node_def.name(),
                                  "concat_shapes", op_instance);
  layer->setAxis(0);
  return ITensorProxyPtr(layer->getOutput(0));
}
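
// Usage sketch (illustrative, with hypothetical 1D shape tensors
// batch_dim_tensor and spatial_dims_tensor):
//   StatusOr<ITensorProxyPtr> new_shape =
//       ConcatenateTensors(params, {batch_dim_tensor, spatial_dims_tensor});
// yields a single shape tensor holding the concatenated dimension sizes.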

// Convert an axis from TF format to TRT format while validating. TF format
// includes the batch dimension, while TRT does not if implicit batching is
// used (i.e. for tensors). TF can also use negative indices.
Status ConvertAxis(int tf_axis, int trt_nb_dims, absl::string_view node_name,
                   bool use_implicit_batch, int* trt_axis) {
  const int tf_nb_dims = trt_nb_dims + (use_implicit_batch ? 1 : 0);
  // Check bounds.
  if (tf_axis < -tf_nb_dims || tf_axis >= tf_nb_dims) {
    return errors::InvalidArgument(
        "Axis value of ", tf_axis, " is out of bounds, must be in range [",
        -tf_nb_dims, ", ", tf_nb_dims, "), at ", node_name);
  }
  // Make negative axis positive.
  if (tf_axis < 0) tf_axis += tf_nb_dims;
  // Don't allow axis to be the batch dimension.
  if (use_implicit_batch && tf_axis == 0) {
    return errors::Unimplemented(
        "TensorRT does not allow manipulation of the batch dimension, at ",
        node_name);
  }
  // Remove batch dimension if it is implicit.
  *trt_axis = use_implicit_batch ? tf_axis - 1 : tf_axis;
  return Status::OK();
}
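
// For example, with trt_nb_dims = 3 in implicit batch mode (tf_nb_dims = 4),
// tf_axis = -1 maps to trt_axis = 2, while tf_axis = 0 is rejected because it
// addresses the batch dimension.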

inline bool DimsEqual(const nvinfer1::Dims& dim_l,
                      const nvinfer1::Dims& dim_r) {
  if (dim_l.nbDims != dim_r.nbDims) {
    return false;
  }
  for (int i = 0; i < dim_l.nbDims; i++) {
    if (dim_l.d[i] != dim_r.d[i]) {
      return false;
    }
  }
  return true;
}

bool AllLengthsEqual(const std::vector<std::vector<int>>& inputs) {
  if (inputs.size() == 0) return true;
  int length = inputs.at(0).size();
  for (int i = 1; i < inputs.size(); i++) {
    if (inputs.at(i).size() != length) return false;
  }
  return true;
}

inline nvinfer1::Dims GetTrtDimsForTensor(const Tensor& tensor) {
  nvinfer1::Dims dims;
  dims.nbDims = tensor.dims();
  for (int i = 0; i < dims.nbDims; i++) {
    dims.d[i] = tensor.dim_size(i);
  }
  return dims;
}

int64_t Prod(const nvinfer1::Dims& dims) {
  int64_t count = 1;
  for (int d = 0; d < dims.nbDims; ++d) {
    count *= dims.d[d];
  }
  return count;
}

// Returns the total number of elements in an ITensor dimension.
// Returns 1 if the number of dims is 0 (the total number is fully determined
// by the batch size).
// Returns -1 if any dimension is unknown.
int64_t TrtTensorDimsNumElements(const nvinfer1::Dims& dims) {
  if (!HasStaticShape(dims)) return -1;
  return Prod(dims);
}
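
// For example, dims {3, {2, 3, 4}} yields 24, dims {2, {-1, 3}} yields -1,
// and zero-rank dims yield 1.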

bool DimsHaveSameSize(const nvinfer1::Dims& lhs, const nvinfer1::Dims& rhs) {
  return TrtTensorDimsNumElements(lhs) == TrtTensorDimsNumElements(rhs);
}

// Returns whether both shapes are fully specified and their total numbers of
// elements are equal.
bool AreDimsStaticWithSameSize(const nvinfer1::Dims& lhs,
                               const nvinfer1::Dims& rhs) {
  if (!HasStaticShape(lhs) || !HasStaticShape(rhs)) return false;
  return DimsHaveSameSize(lhs, rhs);
}

bool AreDimsStaticWithDifferentSize(const nvinfer1::Dims& lhs,
                                    const nvinfer1::Dims& rhs) {
  if (!HasStaticShape(lhs) || !HasStaticShape(rhs)) return false;
  return !DimsHaveSameSize(lhs, rhs);
}

static std::vector<std::pair<int, int>> CreateSamePadding(
    const nvinfer1::Dims& stride, const nvinfer1::Dims& kernel,
    const std::vector<int64_t>& input_dims) {
  std::vector<std::pair<int, int>> padding(input_dims.size());
  CHECK_EQ(stride.nbDims, input_dims.size());  // TODO(jie): N+C? NC+?

  for (size_t i = 0; i < input_dims.size(); ++i) {
    // Formula to calculate the padding.
    int p = ((input_dims[i] - 1) / stride.d[i]) * stride.d[i] + kernel.d[i] -
            input_dims[i];
    p = (p > 0) ? p : 0;

    // Right precedence padding, like in TensorFlow.
    int left = p / 2;
    int right = p - left;

    VLOG(2) << "PADDING_" << i << " pre: " << left << ", post: " << right
            << ", params: " << input_dims[i] << ", " << stride.d[i]
            << ", kernel: " << kernel.d[i];
    padding[i] = {left, right};
  }
  return padding;
}
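
// For example, with input_dim = 7, stride = 2 and kernel = 3:
//   p = ((7 - 1) / 2) * 2 + 3 - 7 = 2, which splits into
//   (left, right) = (1, 1).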

string GetCommonNameScope(const string& op_name_a, const string& op_name_b) {
  size_t last_scope_separator = 0;
  const size_t min_size = std::min(op_name_a.size(), op_name_b.size());
  for (size_t i = 0; i < min_size; ++i) {
    if (op_name_a[i] != op_name_b[i]) break;
    if (op_name_a[i] == '/') last_scope_separator = i + 1;
  }
  return op_name_a.substr(0, last_scope_separator);
}
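
// For example, GetCommonNameScope("model/block1/conv", "model/block1/relu")
// returns "model/block1/".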

// Verifies that shapes of the given inputs match after masking the specified
// dimension.
Status VerifyShapesMatch(absl::Span<const TRT_TensorOrWeights> inputs,
                         int masked_dim, absl::string_view node_name) {
  size_t num_inputs = inputs.size();
  if (num_inputs <= 1) return Status::OK();

  const nvinfer1::Dims dims_0 = inputs.at(0).GetTrtDims();
  for (size_t i = 1; i < num_inputs; ++i) {
    const nvinfer1::Dims dim_i = inputs.at(i).GetTrtDims();
    if (dim_i.nbDims != dims_0.nbDims) {
      return errors::InvalidArgument(
          "Received inputs with inconsistent rank, at ", node_name);
    }
    for (size_t j = 0; j < dims_0.nbDims; ++j) {
      // Dynamic dimensions will be verified at runtime.
      if (dim_i.d[j] == -1 || dims_0.d[j] == -1) continue;
      if (dim_i.d[j] != dims_0.d[j] && j != masked_dim) {
        return errors::InvalidArgument(
            "Received inputs with inconsistent shape, at ", node_name);
      }
    }
  }
  return Status::OK();
}

TRT_ShapedWeights::TRT_ShapedWeights(nvinfer1::DataType type) : type_(type) {
  shape_.nbDims = 0;
  shape_.d[0] = 0;
}

TRT_ShapedWeights::TRT_ShapedWeights(nvinfer1::DataType type,
                                     nvinfer1::Dims dims, Tensor tensor)
    : shape_(dims), type_(type), tensor_(tensor) {
  if (dims.nbDims == 0) {
    DCHECK(dims.d[0] == 0 || dims.d[0] == 1);
  }
}

TRT_ShapedWeights::TRT_ShapedWeights(const TRT_ShapedWeights& rhs)
    : shape_(rhs.shape_), type_(rhs.type_), tensor_(rhs.tensor_) {}

int64_t TRT_ShapedWeights::count(nvinfer1::Dims dims) {
  if (dims.nbDims == 0) {
    assert(dims.d[0] == 0 || dims.d[0] == 1);
    return dims.d[0];
  }
  return Prod(dims);
}

nvinfer1::Weights TRT_ShapedWeights::GetTrtWeights() const {
  return nvinfer1::Weights{type_, GetValues(), count()};
}

template <typename T>
Status TRT_ShapedWeights::SetValues(T value) {
  switch (type_) {
    case nvinfer1::DataType::kFLOAT: {
      float* ptr = tensor_.flat<float>().data();
      std::fill(ptr, ptr + count(), value);
      break;
    }
    case nvinfer1::DataType::kHALF: {
      Eigen::half* ptr = tensor_.flat<Eigen::half>().data();
      std::fill(ptr, ptr + count(), Eigen::half(value));
      break;
    }
    case nvinfer1::DataType::kINT32: {
      int32* ptr = tensor_.flat<int32>().data();
      std::fill(ptr, ptr + count(), value);
      break;
    }
    default:
      return errors::InvalidArgument("Unsupported data type ",
                                     tensorflow::tensorrt::DebugString(type_));
  }
  return Status::OK();
}

Status TRT_ShapedWeights::SetShape(nvinfer1::Dims dims) {
  if (this->count() != TRT_ShapedWeights::count(dims)) {
    VLOG(2) << "Changing shape from "
            << tensorflow::tensorrt::DebugString(shape_) << ", to "
            << tensorflow::tensorrt::DebugString(dims);
    return errors::Internal("SetShape would change number of elements");
  }
  shape_ = dims;
  return Status::OK();
}

size_t TRT_ShapedWeights::size_bytes() const {
  size_t data_type_size = -1;
  switch (type_) {
    case nvinfer1::DataType::kFLOAT:
    case nvinfer1::DataType::kINT32:
      data_type_size = 4;
      break;
    case nvinfer1::DataType::kHALF:
      data_type_size = 2;
      break;
    case nvinfer1::DataType::kINT8:
    case nvinfer1::DataType::kBOOL:
      data_type_size = 1;
      break;
  }
  return this->count() * data_type_size;
}

string TRT_ShapedWeights::DebugString() const {
  return StrCat(
      "TRT_ShapedWeights(shape=", tensorflow::tensorrt::DebugString(shape_),
      ", type=", tensorflow::tensorrt::DebugString(type_),
      ", values=", reinterpret_cast<uintptr_t>(GetValues()), ")");
}

TRT_TensorOrWeights::TRT_TensorOrWeights(ITensorProxyPtr tensor)
    : tensor_proxy_ptr_(tensor), initialized_(true), is_tensor_(true) {}

TRT_TensorOrWeights::TRT_TensorOrWeights(ITensorProxyPtr tensor, int batch_size)
    : tensor_proxy_ptr_(tensor),
      batch_size_(batch_size),
      initialized_(true),
      is_tensor_(true) {}

TRT_TensorOrWeights::TRT_TensorOrWeights(nvinfer1::ITensor* tensor,
                                         int batch_size)
    : tensor_proxy_ptr_(tensor),
      batch_size_(batch_size),
      initialized_(true),
      is_tensor_(true) {}

TRT_TensorOrWeights::TRT_TensorOrWeights(nvinfer1::DataType trt_dtype,
                                         const nvinfer1::Dims& trt_dims,
                                         int batch_size)
    : tensor_proxy_ptr_(new SimpleITensor(trt_dtype, trt_dims)),
      batch_size_(batch_size),
      initialized_(true),
      is_tensor_(true) {}

TRT_TensorOrWeights::TRT_TensorOrWeights(const TRT_ShapedWeights& weights)
    : weights_(weights), initialized_(true), is_tensor_(false) {}

TRT_TensorOrWeights::TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs)
    : tensor_proxy_ptr_(rhs.tensor_proxy_ptr_),
      batch_size_(rhs.batch_size_),
      weights_(rhs.weights_),
      initialized_(rhs.initialized_),
      is_tensor_(rhs.is_tensor_) {}

void TRT_TensorOrWeights::operator=(const TRT_TensorOrWeights& rhs) {
  tensor_proxy_ptr_ = rhs.tensor_proxy_ptr_;
  batch_size_ = rhs.batch_size_;
  weights_ = rhs.weights_;
  initialized_ = rhs.initialized_;
  is_tensor_ = rhs.is_tensor_;
}

ITensorProxyPtr TRT_TensorOrWeights::tensor() const {
  CHECK(is_tensor());
  return tensor_proxy_ptr_;
}

nvinfer1::Dims TRT_TensorOrWeights::GetTrtDims() const {
  if (is_tensor()) {
    return tensor()->getDimensions();
  } else {
    return weights().shape_;
  }
}

Status TRT_TensorOrWeights::GetTfType(DataType* tf_type) const {
  if (is_tensor()) {
    nvinfer1::DataType trt_type = tensor()->getType();
    return TrtTypeToTfType(trt_type, tf_type);
  }
  if (is_weights()) {
    *tf_type = weights().GetTensor().dtype();
    return Status::OK();
  }
  return errors::Internal("The object is probably not initialized");
}

string TRT_TensorOrWeights::DebugString() const {
  string output = "TRT_TensorOrWeights(type=";
  if (is_tensor()) {
    StrAppend(&output, "tensor=", tensorflow::tensorrt::DebugString(tensor()),
              ", batch_size=", batch_size_);
  } else {
    StrAppend(&output, "weights=", weights_.DebugString());
  }
  StrAppend(&output, ")");
  return output;
}

// Performs a 5-dimensional reorder of data on the CPU. This is done once at
// convert time and does not affect GPU inference performance.
// Example: reorder NDHWC (TensorFlow) -> NCDHW (TensorRT).
template <typename T>
void Reorder5(const nvinfer1::Dims& shape, const T* idata,
              const nvinfer1::Dims& istrides, T* odata,
              const nvinfer1::Dims& ostrides) {
  for (int k = 0; k < shape.d[0]; ++k) {
    for (int c = 0; c < shape.d[1]; ++c) {
      for (int d = 0; d < shape.d[2]; ++d) {
        for (int r = 0; r < shape.d[3]; ++r) {
          for (int s = 0; s < shape.d[4]; ++s) {
            odata[k * ostrides.d[0] + c * ostrides.d[1] + d * ostrides.d[2] +
                  r * ostrides.d[3] + s * ostrides.d[4]] =
                idata[k * istrides.d[0] + c * istrides.d[1] +
                      d * istrides.d[2] + r * istrides.d[3] +
                      s * istrides.d[4]];
          }
        }
      }
    }
  }
}

// TODO(jie): reorder4 & reorder2 should be merged?
// TODO(aaroey): fix the order of parameters.
template <typename T>
void Reorder4(const nvinfer1::Dims4& shape, const T* idata,
              const nvinfer1::Dims4& istrides, T* odata,
              const nvinfer1::Dims4& ostrides) {
  for (int n = 0; n < shape.d[0]; ++n) {
    for (int c = 0; c < shape.d[1]; ++c) {
      for (int h = 0; h < shape.d[2]; ++h) {
        for (int w = 0; w < shape.d[3]; ++w) {
          odata[n * ostrides.d[0] + c * ostrides.d[1] + h * ostrides.d[2] +
                w * ostrides.d[3]] =
              idata[n * istrides.d[0] + c * istrides.d[1] + h * istrides.d[2] +
                    w * istrides.d[3]];
        }
      }
    }
  }
}

template <typename T>
void Reorder2(const nvinfer1::DimsHW& shape, const T* idata,
              const nvinfer1::DimsHW& istrides, T* odata,
              const nvinfer1::DimsHW& ostrides) {
  for (int h = 0; h < shape.h(); ++h) {
    for (int w = 0; w < shape.w(); ++w) {
      odata[h * ostrides.h() + w * ostrides.w()] =
          idata[h * istrides.h() + w * istrides.w()];
    }
  }
}

// TODO(jie): fall back to TensorFlow!!
void ReorderCKtoKC(const TRT_ShapedWeights& iweights,
                   TRT_ShapedWeights* oweights) {
  const int c = iweights.shape_.d[0];
  const int k = iweights.shape_.d[1];
  oweights->shape_.d[0] = k;
  oweights->shape_.d[1] = c;
  const nvinfer1::DimsHW istrides = {1, k};
  const nvinfer1::DimsHW ostrides = {c, 1};
  switch (iweights.TrtDType()) {
    case nvinfer1::DataType::kFLOAT: {
      Reorder2({k, c}, static_cast<float const*>(iweights.GetValues()),
               istrides, static_cast<float*>(oweights->GetValues()), ostrides);
      break;
    }
    case nvinfer1::DataType::kHALF: {
      Reorder2({k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
               istrides, static_cast<Eigen::half*>(oweights->GetValues()),
               ostrides);
      break;
    }
    default:
      LOG(FATAL) << "Unsupported type in reorder, expected fp32 or fp16 "
                 << "but got " << DebugString(iweights.TrtDType());
  }
}
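
// For example, with C = 2 and K = 3, ReorderCKtoKC maps the CK-ordered values
//   [w00 w01 w02 w10 w11 w12]
// to the KC-ordered values
//   [w00 w10 w01 w11 w02 w12].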

void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights,
                       TRT_ShapedWeights* oweights, const int num_groups) {
  CHECK(iweights.TrtDType() == oweights->TrtDType());
  CHECK_EQ(iweights.size_bytes(), oweights->size_bytes());
  // K indexes over output channels, C over input channels, and R and S over
  // the height and width of the convolution.
  const int r = iweights.shape_.d[0];
  const int s = iweights.shape_.d[1];
  // TRT requires GKcRS, while TF depthwise has RSCK where c=1, C=G.
  const int c = iweights.shape_.d[2] / num_groups;
  const int k = iweights.shape_.d[3] * num_groups;
  VLOG(2) << "num_groups: " << num_groups << ", c: " << iweights.shape_.d[2]
          << " then " << c << ", k: " << iweights.shape_.d[3] << " then " << k
          << ", r: " << iweights.shape_.d[0] << " then " << r << ", s: "
          << iweights.shape_.d[1] << " then " << s;
  oweights->shape_.d[0] = k / num_groups;
  oweights->shape_.d[1] = c * num_groups;
  oweights->shape_.d[2] = r;
  oweights->shape_.d[3] = s;
  const nvinfer1::Dims4 istrides = {1, k, s * k * c, c * k};
  const nvinfer1::Dims4 ostrides = {c * r * s, r * s, s, 1};
  switch (iweights.TrtDType()) {
    case nvinfer1::DataType::kFLOAT: {
      Reorder4({k, c, r, s}, static_cast<float const*>(iweights.GetValues()),
               istrides, static_cast<float*>(oweights->GetValues()), ostrides);
      break;
    }
    case nvinfer1::DataType::kHALF: {
      Reorder4({k, c, r, s},
               static_cast<Eigen::half const*>(iweights.GetValues()), istrides,
               static_cast<Eigen::half*>(oweights->GetValues()), ostrides);
      break;
    }

    default:
      LOG(FATAL) << "Unsupported type, expected fp32 or fp16 but got "
                 << DebugString(iweights.TrtDType());
  }
}

// Initializes a Dims object with an arbitrary number of dimensions.
nvinfer1::Dims InitDimsN(std::initializer_list<int> list) {
  nvinfer1::Dims dim;
  dim.nbDims = list.size();
  std::copy(list.begin(), list.end(), dim.d);
  return dim;
}

// Reorders 3D convolution weights from TF to TRT.
void ReorderDRSCKToKCDRS(const TRT_ShapedWeights& iweights,
                         TRT_ShapedWeights* oweights, const int num_groups) {
  DCHECK(iweights.TrtDType() == oweights->TrtDType());
  CHECK_EQ(iweights.size_bytes(), oweights->size_bytes());
  // K indexes over output channels, C over input channels, and R, S, D over
  // the height, width, depth.
  const int d = iweights.shape_.d[0];
  const int r = iweights.shape_.d[1];
  const int s = iweights.shape_.d[2];
  // TRT requires GKcRS, while TF depthwise has RSCK where c=1, C=G.
  const int c = iweights.shape_.d[3] / num_groups;
  const int k = iweights.shape_.d[4] * num_groups;

  VLOG(2) << "num_groups: " << num_groups << ", c: " << iweights.shape_.d[3]
          << " becomes " << c << ", k: " << iweights.shape_.d[4] << " becomes "
          << k << ", d: " << d << ", r: " << r << ", s: " << s;

  oweights->shape_.d[0] = iweights.shape_.d[4];  // k / num_groups;
  oweights->shape_.d[1] = iweights.shape_.d[3];  // c * num_groups;
  oweights->shape_.d[2] = d;
  oweights->shape_.d[3] = r;
  oweights->shape_.d[4] = s;

  nvinfer1::Dims shape =
      InitDimsN({k, c, d, r, s});  // KCDRS shape (same as output)

  nvinfer1::Dims ostrides =
      InitDimsN({c * d * r * s, d * r * s, r * s, s,
                 1});  // Output = KCDRS = k*CDRS + c*DRS + d*RS + r*S + s

  nvinfer1::Dims istrides =
      InitDimsN({1, k, r * s * c * k, s * c * k,
                 c * k});  // Input = DRSCK = k*1 + c*K + d*RSCK + r*SCK + s*CK

  switch (iweights.TrtDType()) {
    case nvinfer1::DataType::kFLOAT: {
      Reorder5(shape, static_cast<float const*>(iweights.GetValues()), istrides,
               static_cast<float*>(oweights->GetValues()), ostrides);
      break;
    }
    case nvinfer1::DataType::kHALF: {
      Reorder5(shape, static_cast<Eigen::half const*>(iweights.GetValues()),
               istrides, static_cast<Eigen::half*>(oweights->GetValues()),
               ostrides);
      break;
    }
    default:
      LOG(FATAL) << "Unsupported type, expected fp32 or fp16 but got "
                 << DebugString(iweights.TrtDType());
  }
}

TRT_ShapedWeights TrtWeightStore::GetTempWeights(nvinfer1::DataType trt_dtype,
                                                 const nvinfer1::Dims& dims) {
  TensorShape shape;
  DataType tf_dtype;
  // TODO(laigd): make it return a status.
  TF_CHECK_OK(TensorShapeUtils::MakeShape(dims.d, dims.nbDims, &shape));
  TF_CHECK_OK(TrtTypeToTfType(trt_dtype, &tf_dtype));
  // TODO(jie): check weights size_bytes. 0 means type error
  Tensor tensor(tf_dtype, shape);
  TRT_ShapedWeights weights(trt_dtype, dims, tensor);
  store_.emplace_back(std::move(tensor));
  return weights;
}

OpConverterParams::OpConverterParams(
    const NodeDef& node_def, const std::vector<TRT_TensorOrWeights>& inputs,
    std::vector<TRT_TensorOrWeights>* outputs, TrtWeightStore* weight_store,
    TrtPrecisionMode precision_mode, bool use_calibration,
    bool use_implicit_batch)
    : node_def(node_def),
      inputs(inputs),
      outputs(outputs),
      validation_only(true),
      weight_store(weight_store),
      precision_mode(precision_mode),
      use_calibration(use_calibration),
      use_implicit_batch(use_implicit_batch) {}

OpConverterParams::OpConverterParams(
    Converter* converter, const NodeDef& node_def,
    const std::vector<TRT_TensorOrWeights>& inputs,
    std::vector<TRT_TensorOrWeights>* outputs, TrtWeightStore* weight_store)
    : converter(converter),
      node_def(node_def),
      inputs(inputs),
      outputs(outputs),
      validation_only(false),
      weight_store(weight_store),
      precision_mode(converter->precision_mode()),
      use_calibration(converter->use_calibration()),
      use_implicit_batch(converter->use_implicit_batch()) {}

const std::set<string>* TrtNodeValidator::quantize_ops = new std::set<string>{
    "QuantizeAndDequantizeV2",
    "QuantizeAndDequantizeV3",
    "FakeQuantWithMinMaxVars",
    "FakeQuantWithMinMaxArgs",
};

bool IsQuantizeAndDequantizeOp(const Node* node) {
  return TrtNodeValidator::quantize_ops->count(node->def().op()) != 0;
}

TrtNodeValidator::TrtNodeValidator(
    const grappler::GraphProperties& graph_properties,
    TrtPrecisionMode precision_mode, bool use_calibration,
    bool use_implicit_batch)
    : graph_properties_(graph_properties),
      precision_mode_(precision_mode),
      use_calibration_(use_calibration),
      use_implicit_batch_(use_implicit_batch) {
  RegisterOpValidators();
}

Status TrtNodeValidator::ConvertToTensorOrWeights(
    const NodeDef& node_def, int output_port,
    TRT_TensorOrWeights* tensor_or_weights) {
  if (node_def.op() == "Const") {
    if (output_port != 0) {
      return errors::InvalidArgument("Const node should only have one output.");
    }
    // The output of the conversion will be used as input to other nodes to
    // determine whether TRT supports those nodes. If it cannot convert the
    // Const, it's very likely we cannot treat it as a tensor and make it an
    // input to the TRT network, since TRT removes the first dimension and
    // treats it as batch size. Also, it's not likely that the converter can
    // support the op, and performance may suffer even if it can, so we just
    // simply return error if the conversion fails.
    std::vector<TRT_TensorOrWeights> inputs;
    return ConvertConstToWeights(node_def, inputs, tensor_or_weights);
  }
  if (!graph_properties_.HasOutputProperties(node_def.name())) {
    return errors::InvalidArgument("Shape and data type are unknown");
  }

  // Validate and convert shape and dtype.
  const auto& output_params =
      graph_properties_.GetOutputProperties(node_def.name());
  const auto& tensor_properties = output_params.at(output_port);
  const DataType dtype = tensor_properties.dtype();
  const PartialTensorShape shape = tensor_properties.shape();
  nvinfer1::DataType trt_dtype;
  nvinfer1::Dims trt_dims;
  int batch_size = -1;
  TF_RETURN_IF_ERROR(ValidateTensorProperties(
      node_def.op(), dtype, shape, use_implicit_batch_,
      /*validation_only=*/true, &trt_dtype, &trt_dims, &batch_size));

  // Adds a fake ITensor. This is fine since the op converter operates in
  // validation-only mode and it won't (and shouldn't) use the tensor to do
  // any TRT network operations.
  *tensor_or_weights = TRT_TensorOrWeights(trt_dtype, trt_dims, batch_size);
  return Status::OK();
}

Status TrtNodeValidator::IsTensorRTCandidate(const Node* node) {
  const string& op = node->def().op();
  // In INT8 mode, we will always apply the quantization ranges provided by
  // these ops to the relevant tensors. This happens regardless of the value of
  // use_calibration.
  bool is_supported_op = false;
  if (quantize_ops->count(op)) {
    is_supported_op = (precision_mode_ == TrtPrecisionMode::INT8);
  } else {
    is_supported_op = op_validators_.count(op);
  }
  if (!is_supported_op) {
    return errors::Unimplemented("Op type ", op, " is not supported.");
  }

  // Convert input NodeDef and corresponding output ports to
  // TRT_TensorOrWeights.
  std::vector<TRT_TensorOrWeights> inputs;
  std::vector<const Edge*> input_edges;
  TF_RETURN_IF_ERROR(node->input_edges(&input_edges));
  for (const Edge* edge : input_edges) {
    TRT_TensorOrWeights tensor_or_weights;
    const NodeDef& src_def = edge->src()->def();
    Status status = ConvertToTensorOrWeights(src_def, edge->src_output(),
                                             &tensor_or_weights);
    if (!status.ok()) {
      return errors::Internal(
          "Failed to convert input ", src_def.name(),
          " to a TRT_TensorOrWeights: ", status.error_message());
    }
    inputs.push_back(tensor_or_weights);
  }

  OpConverter validator = op_validators_[op];
  OpConverterParams params(node->def(), inputs, /*arg_outputs=*/nullptr,
                           &weight_store_, precision_mode_, use_calibration_,
                           use_implicit_batch_);
  return validator(&params);
}
1254 
1255 Status TrtNodeValidator::ConvertConstToWeights(
1256     const NodeDef& const_node_def,
1257     const std::vector<TRT_TensorOrWeights>& inputs,
1258     TRT_TensorOrWeights* output) {
1259   std::vector<TRT_TensorOrWeights> outputs;
1260   OpConverterParams params(const_node_def, inputs, &outputs, &weight_store_,
1261                            precision_mode_, use_calibration_,
1262                            use_implicit_batch_);
1263   Status status = op_validators_["Const"](&params);
1264   if (status.ok() && output) *output = outputs[0];
1265   return status;
1266 }
1267 
1268 // static
1269 StatusOr<std::unique_ptr<Converter>> Converter::Create(
1270     TrtPrecisionMode precision_mode, bool use_calibration,
1271     nvinfer1::ILogger* trt_logger, const bool use_implicit_batch,
1272     absl::string_view engine_name) {
1273   std::unique_ptr<Converter> converter = absl::WrapUnique(
1274       new Converter(precision_mode, use_calibration, trt_logger,
1275                     use_implicit_batch, engine_name));
1276   TF_RETURN_IF_ERROR(converter->Init(trt_logger));
1277   return converter;
1278 }
1279 
1280 Converter::Converter(TrtPrecisionMode precision_mode, bool use_calibration,
1281                      nvinfer1::ILogger* trt_logger,
1282                      const bool use_implicit_batch,
1283                      absl::string_view engine_name)
1284     : precision_mode_(precision_mode),
1285       use_calibration_(use_calibration),
1286       use_implicit_batch_(use_implicit_batch),
1287       engine_name_(engine_name) {
1288   MaybeInitializeTrtPlugins(trt_logger);
1289   this->RegisterOpConverters();
1290 }
1291 
1292 Status Converter::Init(nvinfer1::ILogger* trt_logger) {
1293   VLOG(1) << "Creating TensorRT builder";
1294   trt_builder_.reset(nvinfer1::createInferBuilder(*trt_logger));
1295 
1296   VLOG(1) << "Creating TensorRT network";
1297   const uint32_t flags =
1298       use_implicit_batch_
1299           ? 0U
1300           : (1U << static_cast<int>(
1301                  nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
1302   trt_network_.reset(trt_builder_->createNetworkV2(flags));
1303   if (!trt_network_) {
1304     return errors::Internal("Failed to create TensorRT network object");
1305   }
1306   return Status::OK();
1307 }
1308 
1309 Status Converter::ConvertNode(const NodeDef& node_def) {
1310   std::vector<TRT_TensorOrWeights> inputs, outputs;
1311   TF_RETURN_IF_ERROR(this->GetInputs(node_def, &inputs));
1312 
1313   OpConverterParams params(this, node_def, inputs, &outputs, &weight_store_);
1314   const string& op = node_def.op();
1315   auto itr = op_registry_.find(op);
1316   if (itr == op_registry_.end()) {
1317     return errors::Unimplemented("No converter registered for op: ", op);
1318   }
1319   OpConverter op_converter = itr->second;
1320   TF_RETURN_IF_ERROR(op_converter(&params));
1321 
1322   for (size_t i = 0; i < outputs.size(); ++i) {
1323     TRT_TensorOrWeights& output = outputs[i];
1324     string output_name = node_def.name();
1325     if (i != 0) absl::StrAppend(&output_name, ":", i);
1326     // We need to check the name before setting it. If the input is one of
1327     // the engine inputs, setting the name here would overwrite the engine
1328     // input binding and cause a runtime error.
1329     // TODO(tmorris): Remove this work-around once we use TRT's IIdentityLayer
1330     // in ConvertIdentity.
1331     if (output.is_tensor()) {
1332       const char* tensor_name = output.tensor()->getName();
1333       if (!IsEngineInput(tensor_name)) {
1334         // TRT initializes tensor names as "(Unnamed ITensor* N)". We rename
1335         // them to match their corresponding TensorFlow name.
1336         // Note: ITensors that we create internally within TF-TRT which are
1337         // not inputs or outputs of a node will not be renamed. This is a
1338         // potential cause of confusion if an error message or warning
1339         // mentions the unnamed tensor.
1340         output.tensor()->setName(output_name.c_str());
1341       }
1342     }
1343     VLOG(2) << "Adding out tensor " << output_name << ": "
1344             << output.DebugString();
1345     Status status = AddTensorOrWeights(output_name, output);
1346     if (!status.ok()) {
1347       return Status(status.code(),
1348                     StrCat("Failed to add output for node ", node_def.name(),
1349                            ": ", status.error_message()));
1350     }
1351   }
1352   return Status::OK();
1353 }
1354 
1355 Status Converter::AddInputTensor(const string& name, nvinfer1::DataType dtype,
1356                                  const nvinfer1::Dims& dims, int batch_size) {
1357   // We verify the batch size only for input nodes, and rely on individual
1358   // op converters to ensure the batch size of the outputs is not changed.
1359   // TODO(laigd): we need to test these properties.
1360   Status status;
1361   if (use_implicit_batch_) {
1362     status = MaybeUpdateBatchSize(batch_size);
1363     if (!status.ok()) {
1364       return Status(status.code(),
1365                     StrCat("Batch size doesn't match for tensor ", name, ": ",
1366                            status.error_message()));
1367     }
1368   }
1369   ITensorProxyPtr tensor = network()->addInput(name.c_str(), dtype, dims);
1370   if (*tensor == nullptr) {
1371     return errors::InvalidArgument("Failed to create Input layer tensor ", name,
1372                                    " rank=", dims.nbDims);
1373   }
1374   status = AddTensorOrWeights(name, TRT_TensorOrWeights(tensor));
1375   if (!status.ok()) {
1376     return Status(status.code(), StrCat("Failed to add input tensor ", name,
1377                                         ": ", status.error_message()));
1378   }
1379   return Status::OK();
1380 }
1381 
1382 Status Converter::RenameAndMarkOutputTensors(
1383     const std::vector<Converter::EngineOutputInfo>& output_tensors) {
1384   int output_index = 0;
1385   for (const auto& output : output_tensors) {
1386     TRT_TensorOrWeights tensor_or_weights;
1387     TF_RETURN_IF_ERROR(
1388         GetTensorOrWeights(output.source_tensor_name, &tensor_or_weights));
1389     if (!tensor_or_weights.is_tensor()) {
1390       return errors::InvalidArgument("Output ", output.source_tensor_name,
1391                                      " is weights, not a tensor");
1392     }
1393     ITensorProxyPtr tensor = tensor_or_weights.tensor();
1394     if (*tensor == nullptr) {
1395       return errors::NotFound("Output tensor not found: ",
1396                               output.source_tensor_name);
1397     }
1398     // Check if this tensor has already been marked as an input or output.
1399     //
1400     // ConvertIdentity can cause the same tensor to be repeated in
1401     // output_tensors, which can cause us to overwrite the name of the output
1402     // tensor binding. For example, if we rename OutputPH_0 to OutputPH_1 then
1403     // we won't be able to locate OutputPH_0 during runtime. To fix this,
1404     // duplicate the tensor using a no-op shuffle.
1405     //
1406     // TODO(tmorris): Remove this work-around once we use TRT's IIdentityLayer
1407     // in ConvertIdentity.
1408     if (IsEngineInput(tensor->getName()) || IsEngineOutput(tensor->getName())) {
1409       // Use a shuffle layer as an identity by not setting reshape or transpose.
1410       nvinfer1::IShuffleLayer* layer =
1411           network()->addShuffle(*tensor->trt_tensor());
1412       TFTRT_RETURN_ERROR_IF_NULLPTR(
1413           layer, StrCat("Output Copy for ", tensor->getName()));
1414       SetLayerName(layer, tensor->getName(), "shuffle", output_index);
1415       tensor = layer->getOutput(0);
1416     }
1417     tensor->setName(output.dest_node_name.c_str());
1418     network()->markOutput(*tensor->trt_tensor());
1419     // Set type after marking as output. TRT only supports setType for engine
1420     // outputs and inputs (type is inferred otherwise).
1421     tensor->setType(output.trt_dtype);
1422     output_index++;
1423     VLOG(1) << "Marking output TRT tensor " << output.source_tensor_name
1424             << " with data type " << DebugString(output.trt_dtype)
1425             << ", which feeds TF node " << output.dest_node_name;
1426   }
1427   if (VLOG_IS_ON(2)) {
1428     VLOG(2) << "Created TensorRT network with the following layers:";
1429     for (int i = 0; i < network()->getNbLayers(); i++) {
1430       auto layer = network()->getLayer(i);
1431       VLOG(2) << "    " << layer->getName() << " ("
1432               << "type: " << static_cast<int>(layer->getType())
1433               << ", precision: " << static_cast<int>(layer->getPrecision())
1434               << ")";
1435     }
1436   }
1437   return Status::OK();
1438 }
1439 
1440 #if IS_TRT_VERSION_GE(7, 1, 3, 0)
1441 // An algorithm selector that always returns a specific ID for selectAlgorithms.
1442 // This is used to support the implementation of using environment variable
1443 // `TF_TRT_FIXED_ALGORITHM_ID` for debugging TensorRT.
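// For example (illustrative), running with TF_TRT_FIXED_ALGORITHM_ID=0 makes
// the selector pin every layer to its first reported algorithm choice.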
1444 class StaticAlgorithmSelector : public nvinfer1::IAlgorithmSelector {
1445  private:
1446   int32_t algorithm_id_;
1447 
1448  public:
1449   StaticAlgorithmSelector(int32_t algorithm_id) : algorithm_id_(algorithm_id) {}
1450 
1451   // Writes a valid selection index in [0, nbChoices) and returns 1.
1452   int32_t selectAlgorithms(const nvinfer1::IAlgorithmContext& algoContext,
1453                            const nvinfer1::IAlgorithm* const* algoChoices,
1454                            int32_t nbChoices,
1455                            int32_t* selection) noexcept override {
1456     // TensorRT always provides a nonzero number of algorithm choices
1457     // to selectAlgorithms.
1458     assert(nbChoices > 0);
1459 
1460     // Clamp the requested TRT algorithm ID so it does not exceed the
1461     // largest valid choice index.
1462     selection[0] = std::min(algorithm_id_, nbChoices - 1);
1463     return 1;
1464   }
1465 
1466   // Called by TensorRT to report choices it made.
1467   void reportAlgorithms(const nvinfer1::IAlgorithmContext* const* algoContexts,
1468                         const nvinfer1::IAlgorithm* const* algoChoices,
1469                         int32_t nbAlgorithms) noexcept override {
1470   }  // do nothing
1471 };
1472 #endif
1473 
1474 Status Converter::BuildCudaEngine(
1475     TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, int max_batch_size,
1476     size_t max_workspace_size_bytes, nvinfer1::IGpuAllocator* allocator,
1477     TRTInt8Calibrator* calibrator, TrtShapeOptimizationProfile* profiles) {
1478   tensorflow::profiler::AnnotatedTraceMe activity(
1479       [&]() {
1480         return tensorflow::profiler::TraceMeOpOverride("TRTEngineOp",
1481                                                        "BuildEngine");
1482       },
1483       tensorflow::profiler::TraceMeLevel::kInfo);
1484 
1485   VLOG(1) << "Configuring TensorRT builder";
1486   trt_builder_->setMaxBatchSize(max_batch_size);
1487   trt_builder_->setGpuAllocator(allocator);
1488 
1489   // Create a network configuration and use it to build a TRT engine.
1490   TrtUniquePtrType<nvinfer1::IBuilderConfig> builder_config(
1491       trt_builder_->createBuilderConfig());
1492   builder_config->setMaxWorkspaceSize(max_workspace_size_bytes);
1493 
1494 #if IS_TRT_VERSION_GE(7, 1, 3, 0)
1495   static int32_t trt_algorithm_id = [] {
1496     int64 trt_algorithm_id;
1497     TF_CHECK_OK(tensorflow::ReadInt64FromEnvVar("TF_TRT_FIXED_ALGORITHM_ID",
1498                                                 /*default_val=*/-1,
1499                                                 &trt_algorithm_id));
1500     return static_cast<int32_t>(trt_algorithm_id);
1501   }();
1502 
1503   if (trt_algorithm_id >= 0) {
1504     VLOG(1) << "Forcing TRT algorithm selection to: ID=" << trt_algorithm_id;
1505     StaticAlgorithmSelector trt_algorithm_selector(trt_algorithm_id);
1506     builder_config->setAlgorithmSelector(&trt_algorithm_selector);
1507   }
1508 #endif
1509 
1510 #if IS_TRT_VERSION_GE(8, 0, 0, 0)
1511   builder_config->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS);
1512   VLOG(1) << "Setting sparsity for TensorRT8!";
1513 #endif
1514 
1515   if (precision_mode_ == TrtPrecisionMode::FP16) {
1516     builder_config->setFlag(nvinfer1::BuilderFlag::kFP16);
1517   } else if (precision_mode_ == TrtPrecisionMode::INT8) {
1518     builder_config->setFlag(nvinfer1::BuilderFlag::kFP16);
1519     builder_config->setFlag(nvinfer1::BuilderFlag::kINT8);
1520     if (use_calibration_) {
1521       builder_config->setInt8Calibrator(calibrator);
1522     } else {
1523       builder_config->setInt8Calibrator(nullptr);
1524     }
1525   }
1526   if (!use_implicit_batch_ && profiles) {
1527     TF_RETURN_IF_ERROR(profiles->ConfigureBuilder(
1528         trt_builder_.get(), builder_config.get(), network()));
1529   }
1530 
1531   string precision_mode_str;
1532   TF_RETURN_IF_ERROR(
1533       TrtPrecisionModeToName(precision_mode_, &precision_mode_str));
1534   string trt_network_name = StrCat(
1535       "TF:", TF_VERSION_STRING, ", ",
1536       "TRT:", absl::StrJoin(GetLoadedTensorRTVersion(), "."), "-",
1537       "Precision:", precision_mode_str, ", ", "Calibration:", use_calibration_,
1538       ", ", "Max-Batch-Size:", max_batch_size, ", ",
1539       "Max-Workspace-Size:", max_workspace_size_bytes);
1540   VLOG(1) << "Setting TensorRT network name to " << trt_network_name;
1541   network()->setName(trt_network_name.c_str());
1542 
1543   VLOG(1) << "Building TensorRT engine";
1544   if (VLOG_IS_ON(2)) {
1545     VLOG(2) << "Network inputs";
1546     int n_inputs = network()->getNbInputs();
1547     for (int i = 0; i < n_inputs; i++) {
1548       const ITensorProxyPtr input = network()->getInput(i);
1549       if (*input) {
1550         VLOG(2) << "  " << i << " " << input->getName();
1551       } else {
1552         VLOG(2) << "Could not find input " << i;
1553       }
1554     }
1555   }
1556   engine->reset(
1557       trt_builder_->buildEngineWithConfig(*network(), *builder_config));
1558   if (engine->get() == nullptr) {
1559     return errors::Internal("Failed to build TensorRT engine");
1560   }
1561   if (VLOG_IS_ON(2)) {
1562     VLOG(2) << "TRT engine created";
1563     int nbBindings = (*engine)->getNbBindings();
1564     VLOG(2) << "Number of engine bindings: " << nbBindings;
1565     for (int i = 0; i < nbBindings; i++) {
1566       VLOG(2) << "Binding " << i << " name: " << (*engine)->getBindingName(i);
1567     }
1568   }
1569   return Status::OK();
1570 }
1571 
1572 Status Converter::MaybeUpdateBatchSize(int batch_size) {
1573   // OK iff either batch size is unknown or the two are equal.
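  // Illustrative examples: (batch_size_ == -1, batch_size == 8) records 8;
  // (8, 8) is accepted as-is; (8, 4) returns an InvalidArgument error.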
1574   if (this->batch_size_ < 0 || batch_size < 0 ||
1575       this->batch_size_ == batch_size) {
1576     if (this->batch_size_ < 0 && batch_size >= 0) {
1577       this->batch_size_ = batch_size;
1578     }
1579     return Status::OK();
1580   }
1581   return errors::InvalidArgument(
1582       "Provided batch size does not match converter batch size: ", batch_size,
1583       " vs ", batch_size_);
1584 }
1585 
1586 Status Converter::AddTensorOrWeights(const string& name,
1587                                      TRT_TensorOrWeights input) {
1588   // Set the batch size of the tensor, using batch size collected from the
1589   // input tensors to the TRT subgraph at the beginning of the conversion.
1590   // We rely on the individual op converter to understand the semantics of the
1591   // TF node, and make sure it doesn't change the batch size nor introduce
1592   // intra-element dependency inside the batch.
1593   if (use_implicit_batch_ && input.is_tensor()) {
1594     input.set_batch_size(batch_size_);
1595   }
1596   if (trt_tensors_.insert({name, std::move(input)}).second) return Status::OK();
1597   return errors::AlreadyExists("tensor/weights ", name, " already exists.");
1598 }
1599 
1600 Status Converter::GetTensorOrWeights(const string& name,
1601                                      TRT_TensorOrWeights* output) {
1602   if (!trt_tensors_.count(name)) {
1603     return errors::NotFound("Tensor or weights with name ", name,
1604                             " could not be found.");
1605   }
1606   *output = trt_tensors_.at(name);
1607   return Status::OK();
1608 }
1609 
1610 Status Converter::TransposeTensor(ITensorProxyPtr input_tensor,
1611                                   const std::vector<int>& order_with_batch_dim,
1612                                   ITensorProxyPtr* output_tensor,
1613                                   const NodeDef& node_def,
1614                                   absl::string_view sub_op_name) {
1615   const auto dims = input_tensor->getDimensions();
1616   const int order_size = use_implicit_batch_ ? order_with_batch_dim.size() - 1
1617                                              : order_with_batch_dim.size();
1618   if (order_size != size_t(dims.nbDims)) {
1619     return errors::InvalidArgument(
1620         "Rank of perm for transpose does not match with that of the input.");
1621   }
1622   if (use_implicit_batch_ && order_with_batch_dim[0] != 0) {
1623     return errors::Unimplemented(
1624         "Transpose at batch dimension is not supported.");
1625   }
1626 
1627   nvinfer1::IShuffleLayer* layer =
1628       this->network()->addShuffle(*input_tensor->trt_tensor());
1629   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Transpose");
1630   SetLayerName(layer, node_def, sub_op_name);
1631 
1632   nvinfer1::Permutation permutation;
1633   if (use_implicit_batch_) {
1634     for (int32_t i = 0; i < dims.nbDims; ++i) {
1635       permutation.order[i] = order_with_batch_dim[i + 1] - 1;
1636     }
1637   } else {
1638     std::copy(order_with_batch_dim.begin(), order_with_batch_dim.end(),
1639               permutation.order);
1640   }
1641   VLOG(1) << "TransposeTensor permutation: "
1642           << DebugString(permutation, dims.nbDims);
1643   layer->setFirstTranspose(permutation);
1644 
1645   nvinfer1::Dims reshape_dims;
1646   reshape_dims.nbDims = dims.nbDims;
1647   for (int32_t i = 0; i < reshape_dims.nbDims; ++i) {
1648     reshape_dims.d[i] = 0;
1649   }
1650   layer->setReshapeDimensions(reshape_dims);
1651 
1652   *output_tensor = layer->getOutput(0);
1653   return Status::OK();
1654 }
1655 
1656 Status Converter::GetWeightRange(const TRT_ShapedWeights& weights,
1657                                  float* out_min, float* out_max) const {
1658   switch (weights.TrtDType()) {
1659     case nvinfer1::DataType::kFLOAT: {
1660       auto inp = static_cast<float const*>(weights.GetValues());
1661       auto result = std::minmax_element(inp, inp + weights.count());
1662       *out_min = *result.first;
1663       *out_max = *result.second;
1664       break;
1665     }
1666     case nvinfer1::DataType::kHALF: {
1667       auto inp = static_cast<Eigen::half const*>(weights.GetValues());
1668       auto result = std::minmax_element(inp, inp + weights.count());
1669       *out_min = static_cast<float>(*result.first);
1670       *out_max = static_cast<float>(*result.second);
1671       break;
1672     }
1673     case nvinfer1::DataType::kINT32: {
1674       auto inp = static_cast<int const*>(weights.GetValues());
1675       auto result = std::minmax_element(inp, inp + weights.count());
1676       *out_min = static_cast<float>(*result.first);
1677       *out_max = static_cast<float>(*result.second);
1678       break;
1679     }
1680     default:
1681       return errors::Unimplemented(
1682           "Data type not supported for GetWeightRange: ",
1683           DebugString(weights.TrtDType()));
1684   }
1685   return Status::OK();
1686 }
1687 
1688 // Constructs <tf_related_part> for the ILayer name as
1689 // <tf_node_def_name>_<sub_op_name>_<sub_op_instance> and calls
1690 // SetLayerNameHelper to set the name for the ILayer.
1691 //
1692 // If the operation represented by the ILayer is generated by the converter to
1693 // support the conversion of node_def, callers need to specify a non-empty
1694 // sub_op_name to be appended to the name of node_def to avoid layer name
1695 // conflicts. If the operation is generated multiple times, callers also need
1696 // to specify sub_op_instance to be appended to the name of the layers to avoid
1697 // layer name conflicts.
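// For example (illustrative): converting node "conv1" with sub_op_name
// "shuffle" and sub_op_instance 2 yields a <tf_related_part> such as
// "conv1-shuffle_2", assuming GetLayerNameSuffix joins the suffix pieces with
// an underscore.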
1698 void Converter::SetLayerName(nvinfer1::ILayer* layer, const NodeDef& node_def,
1699                              absl::string_view sub_op_name,
1700                              absl::optional<int> sub_op_instance,
1701                              absl::optional<std::string> origin_node_name) {
1702   std::string sub_op_suffix = GetLayerNameSuffix(sub_op_name, sub_op_instance);
1703   if (sub_op_suffix.empty()) {
1704     SetLayerNameHelper(layer, engine_name_, node_def.name());
1705   } else if (origin_node_name.has_value()) {
1706     SetLayerNameHelper(layer, engine_name_,
1707                        absl::StrCat(node_def.name(), "-",
1708                                     absl::string_view(origin_node_name.value()),
1709                                     "-", sub_op_suffix));
1710   } else {
1711     SetLayerNameHelper(layer, engine_name_,
1712                        absl::StrCat(node_def.name(), "-", sub_op_suffix));
1713   }
1714 }
1715 
1716 // Constructs <tf_related_part> for the ILayer name as
1717 // <main_op_name>_<sub_op_name>_<sub_op_instance> and calls SetLayerNameHelper
1718 // to set the name for the ILayer.
1719 void Converter::SetLayerName(nvinfer1::ILayer* layer,
1720                              absl::string_view main_op_name,
1721                              absl::string_view sub_op_name,
1722                              absl::optional<int> sub_op_instance) {
1723   std::string layer_name_suffix =
1724       GetLayerNameSuffix(sub_op_name, sub_op_instance);
1725   SetLayerNameHelper(layer, engine_name_,
1726                      absl::StrCat(main_op_name, "-", layer_name_suffix));
1727 }
1728 
1729 // Converts 'input' of 'node_def' into 'tensor' with shape specified by 'dims'
1730 // (which doesn't contain the batch dimension).
1731 //
1732 // If validation_only is true, it doesn't do the conversion but only does
1733 // some minimal validation of the eligibility of the conversion, and *tensor
1734 // will be set to nullptr.
1735 Status PrepareTensorForShape(Converter* converter,
1736                              const TRT_TensorOrWeights& input,
1737                              const nvinfer1::Dims& dims,
1738                              const bool validation_only,
1739                              ITensorProxyPtr* tensor, const NodeDef& node_def,
1740                              absl::optional<int> op_instance,
1741                              absl::optional<std::string> origin_node_name) {
1742   const nvinfer1::Dims input_dims = input.GetTrtDims();
1743   // The input shape may have -1s for dynamic shape. The target shape may have
1744   // 0s representing copy over the corresponding input dimensions. It may also
1745   // have at most one -1 representing a dimension value that needs to be
1746   // inferred. If none of those special values are present, we verify that
1747   // the total sizes of the input and output shape are the same.
1748   // TODO(tfeher): Verify that the total sizes of the input and output shape
1749   // are the same in the presence of 0s but no -1 in the target shape.
1750   // If an input is a weight, it is going to become a tensor via
1751   // CreateConstantLayer. So we can treat it as a tensor for
1752   // AreDimsStaticWithDifferentSize(). This really only matters for 0-D tensors.
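  // Illustrative example (assumed semantics): with input_dims [2, 3, 4],
  // target dims [6, -1] infers the -1 as 4, while target dims [0, 12] copies
  // the input's first dimension to give [2, 12].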
1753   if (Prod(dims) > 0 && AreDimsStaticWithDifferentSize(input_dims, dims)) {
1754     return errors::InvalidArgument(
1755         "Incompatible shapes: ", DebugString(input_dims), " vs. ",
1756         DebugString(dims));
1757   }
1758   // ConstantLayer requires static shapes (cannot infer -1).
1759   if (input.is_weights() && !HasStaticShape(dims)) {
1760     return errors::InvalidArgument("Shape is not fully defined: ",
1761                                    DebugString(dims));
1762   }
1763   if (validation_only) {
1764     *tensor = nullptr;
1765     return Status::OK();
1766   }
1767 
1768   TFTRT_RETURN_ERROR_IF_NULLPTR(converter, "converter is nullptr");
1769   if (input.is_tensor()) {
1770     if (DimsEqual(input_dims, dims)) {
1771       *tensor = input.tensor();
1772     } else {
1773       nvinfer1::IShuffleLayer* layer =
1774           converter->network()->addShuffle(*input.tensor()->trt_tensor());
1775       TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Reshape");
1776       converter->SetLayerName(layer, node_def, "shuffle", op_instance,
1777                               origin_node_name);
1778       layer->setReshapeDimensions(dims);
1779       *tensor = layer->getOutput(0);
1780     }
1781   } else {
1782     *tensor = converter->CreateConstantLayer(input.weights(), dims);
1783     TFTRT_RETURN_ERROR_IF_NULLPTR(*tensor, "TF-TRT Internal Reshape");
1784   }
1785   return Status::OK();
1786 }
1787 
1788 void Converter::ProvideQuantizationRange(ITensorProxyPtr* tensor,
1789                                          float min_range, float max_range) {
1790   float symmetric_range = std::max(std::abs(min_range), std::abs(max_range));
1791   if ((*tensor)->is_trt_tensor()) {
1792     quantization_ranges_[(*tensor)->trt_tensor()] = symmetric_range;
1793   } else if ((*tensor)->is_simple_tensor()) {
1794     quantization_ranges_proxy_[tensor] = symmetric_range;
1795   }
1796 }
1797 
1798 namespace {
1799 
1800 bool IsConvolution(const nvinfer1::ILayer* layer) {
1801   return layer->getType() == nvinfer1::LayerType::kCONVOLUTION;
1802 }
1803 
1804 bool IsScale(const nvinfer1::ILayer* layer) {
1805   return layer->getType() == nvinfer1::LayerType::kSCALE;
1806 }
1807 
1808 bool IsClipOrRelu(const nvinfer1::ILayer* layer) {
1809   if (layer->getType() != nvinfer1::LayerType::kACTIVATION) {
1810     return false;
1811   }
1812   auto activation_type = static_cast<const nvinfer1::IActivationLayer*>(layer)
1813                              ->getActivationType();
1814 
1815   return activation_type == nvinfer1::ActivationType::kRELU ||
1816          activation_type == nvinfer1::ActivationType::kCLIP;
1817 }
1818 
1819 bool IsAdd(const nvinfer1::ILayer* layer) {
1820   if (layer->getType() != nvinfer1::LayerType::kELEMENTWISE) {
1821     return false;
1822   }
1823   auto operation =
1824       static_cast<const nvinfer1::IElementWiseLayer*>(layer)->getOperation();
1825   return operation == nvinfer1::ElementWiseOperation::kSUM;
1826 }
1827 
1828 }  // namespace
1829 
1830 void Converter::MaybeApplyQuantizationRanges() {
1831   if (precision_mode() != TrtPrecisionMode::INT8) return;
1832 
1833   // Apply ranges.
1834   for (auto pair : quantization_ranges_) {
1835     nvinfer1::ITensor* tensor = pair.first;
1836     const float range = pair.second;
1837     VLOG(1) << "Setting range for: " << tensor->getName() << ": " << range;
1838     // TODO(laigd): if 'tensor' already has a range set which doesn't match
1839     // 'range', it should report error.
1840     tensor->setDynamicRange(-range, range);
1841   }
1842   for (auto pair : quantization_ranges_proxy_) {
1843     ITensorProxyPtr tensor = *pair.first;
1844     const float range = pair.second;
1845     VLOG(1) << "Setting range for: " << tensor->getName() << ": " << range;
1846     // TODO(laigd): if 'tensor' already has a range set which doesn't match
1847     // 'range', it should report error.
1848     tensor->setDynamicRange(-range, range);
1849   }
1850 }
1851 
1852 Status Converter::GetInputs(const NodeDef& node_def,
1853                             std::vector<TRT_TensorOrWeights>* inputs) const {
1854   for (auto const& input_name : node_def.input()) {
1855     /*************************************************************************
1856      * TODO(jie): handle case 1) here.
1857      * Normalizes the inputs and extracts associated metadata:
1858      * 1) Inputs can contain a colon followed by a suffix of characters.
1859      *    That suffix may be a single number (e.g. inputName:1) or several
1860      *    word characters separated from a number by a colon
1861      *    (e.g. inputName:foo:1). The
1862      *    latter case is used to denote inputs and outputs of functions.
1863      * 2) Control dependency inputs contain caret at the beginning and we
1864      *    remove this and annotate the edge as a control dependency.
1865      ************************************************************************/
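    // Illustrative examples: "foo" and "foo:0" both refer to output 0 of
    // node foo (the ":0" suffix is stripped below); "foo:1" refers to
    // output 1; "^bar" is a control dependency and is skipped.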
1866     // skip control nodes
1867     if (input_name[0] == '^') continue;
1868     string name = input_name;
1869     auto last = name.find_last_of(':');
1870     // TODO(aaroey): use TensorId
1871     if (last != string::npos && last + 2 == name.size() &&
1872         name[last + 1] == '0') {
1873       name.erase(last);
1874     }
1875 
1876     if (trt_tensors_.count(name)) {
1877       TRT_TensorOrWeights input = trt_tensors_.at(name);
1878       inputs->push_back(input);
1879       VLOG(2) << "Retrieved input " << name << ": " << input.DebugString();
1880     } else {
1881       // TODO(aaroey): this should not happen, make it a CHECK.
1882       // TODO(aaroey): use StrCat for pattern like this.
1883       string msg("Node ");
1884       StrAppend(&msg, node_def.name(), " should have an input named '", name,
1885                 "' but it is not available");
1886       LOG(ERROR) << msg;
1887       return errors::InvalidArgument(msg);
1888     }
1889   }
1890   return Status::OK();
1891 }
1892 
1893 enum class TrtInputArg { kTensor = 1, kWeight = 2, kBoth = 3 };
1894 
1895 // Checks that the number of inputs match, and enforces that the inputs marked
1896 // as weights are constant. Inputs are allowed to be both weight and tensor.
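// Illustrative usage (see ConvertShape below):
//   TF_RETURN_IF_ERROR(
//       CheckInputsWeights(*params, {{"input", TrtInputArg::kBoth}}));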
1897 Status CheckInputsWeights(
1898     const OpConverterParams& params,
1899     const std::vector<std::pair<string, TrtInputArg>>& expected_inputs) {
1900   const auto& inputs = params.inputs;
1901   const auto& node_def = params.node_def;
1902   if (inputs.size() != expected_inputs.size()) {
1903     return errors::InvalidArgument(
1904         node_def.op(), " got ", inputs.size(), " inputs but expected ",
1905         expected_inputs.size(), ", at ", node_def.name());
1906   }
1907   for (int i = 0; i < inputs.size(); i++) {
1908     if (expected_inputs[i].second == TrtInputArg::kWeight &&
1909         inputs.at(i).is_tensor()) {
1910       return errors::Unimplemented("The input \"", expected_inputs[i].first,
1911                                    "\" for ", node_def.op(),
1912                                    " must be a constant, at ", node_def.name());
1913     }
1914     // TODO(tfeher): Remove this check and provide a method to automatically
1915     // retrieve an input as a tensor, converting via CreateConstantLayer if it
1916     // was originally a weight. We will want a caching mechanism to prevent many
1917     // duplicate constants from being created.
1918     if (expected_inputs[i].second == TrtInputArg::kTensor &&
1919         inputs.at(i).is_weights()) {
1920       return errors::Unimplemented("The input \"", expected_inputs[i].first,
1921                                    "\" for ", node_def.op(),
1922                                    " must be a tensor, at ", node_def.name());
1923     }
1924   }
1925   return Status::OK();
1926 }
1927 
1928 // Checks that the number of inputs match, and enforces that the inputs marked
1929 // as true are constant weights. true means that the input must be a weight,
1930 // while false means the input must be a tensor.
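// Illustrative usage: CheckInputsWeights(*params, {{"input", false},
// {"filter", true}}) requires "input" to be a tensor and "filter" to be a
// constant weight, as ConvertConv2DHelper does below.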
1931 Status CheckInputsWeights(
1932     const OpConverterParams& params,
1933     const std::vector<std::pair<string, bool>>& inputs_is_weight) {
1934   std::vector<std::pair<string, TrtInputArg>> expected_inputs;
1935   expected_inputs.reserve(inputs_is_weight.size());
1936   std::transform(
1937       inputs_is_weight.begin(), inputs_is_weight.end(),
1938       std::back_inserter(expected_inputs), [](std::pair<string, bool> x) {
1939         return std::make_pair(
1940             x.first, x.second ? TrtInputArg::kWeight : TrtInputArg::kTensor);
1941       });
1942   return CheckInputsWeights(params, expected_inputs);
1943 }
1944 
1945 Status GetNodeDefTfType(const NodeDef& node_def, DataType* tf_type,
1946                         const char* type_attr_name) {
1947   TFAttrs attrs(node_def);
1948   if (!attrs.count(type_attr_name)) {
1949     return errors::InvalidArgument("Attribute with name ", type_attr_name,
1950                                    " not found.");
1951   }
1952   *tf_type = attrs.get<DataType>(type_attr_name);
1953   return Status::OK();
1954 }
1955 
1956 Status GetInputTfType(const OpConverterParams& params, DataType* tf_type,
1957                       int pos) {
1958   const std::vector<TRT_TensorOrWeights>& inputs = params.inputs;
1959   if (inputs.size() <= pos) {
1960     return errors::Internal("Invalid input position");
1961   }
1962 
1963   return inputs[pos].GetTfType(tf_type);
1964 }
1965 
1966 constexpr const char kOutputTypeAttrName[] = "T";
1967 
1968 Status GetOutputTfType(const OpConverterParams& params, DataType* tf_type) {
1969   return GetNodeDefTfType(params.node_def, tf_type, kOutputTypeAttrName);
1970 }
1971 
1972 Status AllowDataTypes(const OpConverterParams& params,
1973                       const std::set<DataType>& allowed_types,
1974                       const char* type_attr_name = kOutputTypeAttrName) {
1975   const auto& node_def = params.node_def;
1976   DataType tf_type;
1977   TF_RETURN_IF_ERROR(GetNodeDefTfType(node_def, &tf_type, type_attr_name));
1978   if (!allowed_types.count(tf_type)) {
1979     string allowed_types_string = absl::StrJoin(
1980         allowed_types, ", ", [](string* out, const DataType& type) {
1981           absl::StrAppendFormat(out, "%s", DataTypeString(type));
1982         });
1983     return errors::Unimplemented("Data type ", DataTypeString(tf_type),
1984                                  " is not supported for ", node_def.op(),
1985                                  ", must be one of [", allowed_types_string,
1986                                  "], at ", node_def.name());
1987   }
1988   return Status::OK();
1989 }
1990 
1991 // ****************************************************************************
1992 // Constant folding functions for weights.
1993 // TODO(laigd): we should probably use eigen directly.
1994 // ****************************************************************************
1995 struct LambdaFactory {
1996   enum class OP_CATEGORY : int { RSQRT = 0, NEG, RECIP };
1997   OP_CATEGORY op;
1998 
1999   template <typename T>
2000   std::function<T(T)> unary() {
2001     switch (op) {
2002       case OP_CATEGORY::RSQRT: {
2003         VLOG(2) << "RSQRT GETS DONE";
2004         return [](T t) -> T { return 1.0 / std::sqrt(t); };
2005       }
2006       case OP_CATEGORY::NEG:
2007         return [](T t) -> T { return -t; };
2008       case OP_CATEGORY::RECIP:
2009         return [](T t) -> T { return 1.0 / t; };
2010       default:
2011         LOG(ERROR) << "Not supported op for unary: " << static_cast<int>(op);
2012         return nullptr;
2013     }
2014   }
2015 };
2016 
2017 template <>
2018 std::function<Eigen::half(Eigen::half)> LambdaFactory::unary<Eigen::half>() {
2019   switch (op) {
2020     case OP_CATEGORY::RSQRT: {
2021       VLOG(2) << "RSQRT GETS DONE";
2022       return [](Eigen::half t) {
2023         return Eigen::half(1.0 / std::sqrt(static_cast<float>(t)));
2024       };
2025     }
2026     case OP_CATEGORY::NEG:
2027       return [](Eigen::half t) { return -t; };
2028     case OP_CATEGORY::RECIP:
2029       return [](Eigen::half t) {
2030         return Eigen::half(1.0 / static_cast<float>(t));
2031       };
2032     default:
2033       LOG(ERROR) << "Not supported op for unary: " << static_cast<int>(op);
2034       return nullptr;
2035   }
2036 }
2037 
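// Illustrative usage (not from the original source):
//   LambdaFactory factory;
//   factory.op = LambdaFactory::OP_CATEGORY::RSQRT;
//   auto rsqrt = factory.unary<float>();  // rsqrt(4.0f) == 0.5f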
2038 Status UnaryCompute(const TRT_ShapedWeights& iweights,
2039                     TRT_ShapedWeights* oweights, LambdaFactory unary_op) {
2040   CHECK(iweights.TrtDType() == oweights->TrtDType());
2041   switch (iweights.TrtDType()) {
2042     case nvinfer1::DataType::kFLOAT: {
2043       auto inp = static_cast<float const*>(iweights.GetValues());
2044       auto oup = static_cast<float*>(oweights->GetValues());
2045       std::transform(inp, inp + iweights.count(), oup, unary_op.unary<float>());
2046       break;
2047     }
2048     case nvinfer1::DataType::kHALF: {
2049       auto inp = static_cast<Eigen::half const*>(iweights.GetValues());
2050       auto oup = static_cast<Eigen::half*>(oweights->GetValues());
2051       std::transform(inp, inp + iweights.count(), oup,
2052                      unary_op.unary<Eigen::half>());
2053       break;
2054     }
2055     default:
2056       return errors::Unimplemented("Data type not supported: ",
2057                                    DebugString(iweights.TrtDType()));
2058   }
2059   return Status::OK();
2060 }
2061 
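// Computes SAME or VALID padding for a 2D convolution from the kernel size,
// dilation, stride, and input dims. If the required padding is asymmetric,
// an explicit IPaddingLayer is inserted in front of the convolution and
// *padding is reset to zero, so the convolution itself uses symmetric
// padding.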
2062 Status Conv2DPaddingHelper(OpConverterParams* params, const TFAttrs& attrs,
2063                            const nvinfer1::DimsHW& kernel_size,
2064                            const nvinfer1::DimsHW& dilation,
2065                            const nvinfer1::DimsHW& stride,
2066                            const std::vector<int64_t>& input_dims,
2067                            ITensorProxyPtr tensor,
2068                            std::vector<std::pair<int, int>>* padding,
2069                            ITensorProxyPtr* padded_tensor) {
2070   if (attrs.get<string>("padding") == "SAME") {
2071     nvinfer1::DimsHW effective_kernel_size = kernel_size;
2072     effective_kernel_size.h() += (kernel_size.h() - 1) * (dilation.h() - 1);
2073     effective_kernel_size.w() += (kernel_size.w() - 1) * (dilation.w() - 1);
2074     *padding = CreateSamePadding(stride, effective_kernel_size, input_dims);
2075   } else {
2076     *padding = {{0, 0}, {0, 0}};
2077   }
2078 
2079   if ((*padding)[0].first != (*padding)[0].second ||
2080       (*padding)[1].first != (*padding)[1].second) {
2081     auto pad_layer = params->converter->network()->addPadding(
2082         *tensor->trt_tensor(),
2083         nvinfer1::DimsHW((*padding)[0].first, (*padding)[1].first),
2084         nvinfer1::DimsHW((*padding)[0].second, (*padding)[1].second));
2085     TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, params->node_def.name());
2086     params->converter->SetLayerName(pad_layer, params->node_def, "pad");
2087     tensor = pad_layer->getOutput(0);
2088     *padding = {{0, 0}, {0, 0}};
2089   }
2090   *padded_tensor = tensor;
2091   return Status::OK();
2092 }
2093 
2094 namespace {
2095 // Extracts the spatial dimensions from `output_sizes` and returns them as a
2096 // vector of size 2.
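// Illustrative example: with NHWC indexing (h_index == 1, w_index == 2),
// output_sizes [8, 32, 64, 16] yields {32, 64}; a two-element output_sizes
// [32, 64] yields {32, 64} directly.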
2097 std::vector<int64_t> GetSpatialDimsFromOutputSizes(
2098     const TRT_TensorOrWeights& output_sizes, const int h_index,
2099     const int w_index) {
2100   // We use h_index and w_index instead of 1 and 2 because we haven't
2101   // transposed output_sizes along with the input.
2102   const TRT_ShapedWeights& weights = output_sizes.weights();
2103   const int output_sizes_length = weights.count();
2104   auto output_sizes_values = static_cast<int*>(weights.GetValues());
2105   // The length of output_sizes can be 2 or 4. When the length is 2, it is
2106   // already <height,width>; when 4, h_index/w_index select the spatial dims.
2107   return {output_sizes_values[output_sizes_length == 4 ? h_index : 0],
2108           output_sizes_values[output_sizes_length == 4 ? w_index : 1]};
2109 }
2110 }  // namespace
2111 
2112 Status ConvertConv2DHelper(OpConverterParams* params, int group,
2113                            bool is_conv2d_backprop_input) {
2114   const auto& inputs = params->inputs;
2115   const auto& node_def = params->node_def;
2116   TRT_TensorOrWeights backprop_output_size;
2117   ITensorProxyPtr tensor = nullptr;
2118   if (is_conv2d_backprop_input) {
2119     // In the case when Conv2dBackpropInput is used for conv2d_transpose, these
2120     // inputs correspond to: output size, filter, and input.
2121     TF_RETURN_IF_ERROR(CheckInputsWeights(
2122         *params,
2123         {{"input_sizes", true}, {"filter", true}, {"out_backprop", false}}));
2124     backprop_output_size = inputs.at(0);
2125     tensor = inputs.at(2).tensor();
2126     if (!HasStaticShape(tensor->getDimensions())) {
2127       // TODO(tfeher): Allow dynamic input. We need to implement padding
2128       // correction for dynamic shapes in this case.
2129       return errors::Unimplemented(
2130           "Conv2dBackpropInput does not support input with unknown shape, at ",
2131           node_def.name());
2132     }
2133   } else {
2134     TF_RETURN_IF_ERROR(
2135         CheckInputsWeights(*params, {{"input", false}, {"filter", true}}));
2136     tensor = inputs.at(0).tensor();
2137   }
2138   TF_RETURN_IF_ERROR(
2139       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
2140   TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
2141   if (weights_rsck.shape_.nbDims != 4) {
2142     return errors::InvalidArgument("Conv2D expects kernel of dimension 4, at " +
2143                                    node_def.name());
2144   }
2145   TFAttrs attrs(node_def);
2146   auto data_format = attrs.get<string>("data_format");
2147   int c_index = (data_format == "NHWC") ? 3 : 1;
2148   int h_index = (data_format == "NHWC") ? 1 : 2;
2149   int w_index = (data_format == "NHWC") ? 2 : 3;
2150   auto tf_dilations = attrs.get<std::vector<int64>>("dilations");
2151   if (tf_dilations.size() != 4) {
2152     return errors::InvalidArgument(
2153         "Convolution dilations field must specify 4 dimensions, at ",
2154         node_def.name());
2155   }
2156   if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) {
2157     return errors::Unimplemented(
2158         "Dilation rate must be 1 for batch and channel dimensions, at ",
2159         node_def.name());
2160   }
2161   const nvinfer1::DimsHW dilation(tf_dilations[h_index], tf_dilations[w_index]);
2162   if (is_conv2d_backprop_input && (dilation.d[0] != 1 || dilation.d[1] != 1)) {
2163     return errors::Unimplemented(
2164         "Dilation with Conv2DBackpropInput (conv2d_transpose) is not supported",
2165         ", at ", node_def.name());
2166   }
2167 
2168   const auto tf_stride = attrs.get<std::vector<int64>>("strides");
2169   if (tf_stride.size() != 4) {
2170     return errors::InvalidArgument(
2171         "Convolution strides field must specify 4 dimensions, at ",
2172         node_def.name());
2173   }
2174   if (tf_stride[0] != 1 || tf_stride[c_index] != 1) {
2175     return errors::Unimplemented(
2176         "Stride must be 1 for batch and channel dimensions, at ",
2177         node_def.name());
2178   }
2179   // Channel dim must be static for DepthwiseConv2dNative since we use that
2180   // value for num_groups at build time.
2181   if (!params->use_implicit_batch && tensor->getDimensions().d[c_index] == -1) {
2182     return errors::InvalidArgument("Channel dimension must be static, at ",
2183                                    node_def.name());
2184   }
2185   string padding = attrs.get<string>("padding");
2186   if (padding != "SAME" && padding != "VALID") {
2187     return errors::Unimplemented(padding +
2188                                  " padding type not implemented, "
2189                                  "only VALID and SAME are supported");
2190   }
2191   const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
2192   if (params->validation_only) return Status::OK();
2193 
2194   // Transpose to NCHW (NCHW is required for IConvLayer).
2195   const bool need_transpose = (data_format == "NHWC");
2196   if (need_transpose) {
2197     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
2198         tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW"));
2199   }
2200   // Dimensions of transposed tensor.
2201   const auto tensor_dim = tensor->getDimensions();
2202   const int c_dim_size = tensor_dim.d[params->use_implicit_batch ? 0 : 1];
2203 
2204   // group == 0 signifies that this is a depthwise convolution, so set
2205   // num_groups to size of input's channel dim. For a non-depthwise conv,
2206   // num_groups will be 1.
2207   const int num_groups = (group == 0) ? c_dim_size : group;
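  // (For example, a depthwise-conv converter would call this helper with
  // group == 0, while a standard Conv2D conversion uses group == 1.)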
2208 
2209   // For conv, TF weights are RSCK, and TRT expects KCRS.
2210   // For backprop, TF weights are RSKC, and TRT expects CKRS.
2211   // Therefore, this reorder will work for both cases.
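  // Illustrative example: a Conv2D filter of RSCK shape [5, 5, 32, 64]
  // becomes KCRS [64, 32, 5, 5] after the reorder (with num_groups == 1).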
2212   TRT_ShapedWeights weights =
2213       params->weight_store->GetTempWeights(weights_rsck);
2214   ReorderRSCKToKCRS(weights_rsck, &weights, num_groups);
2215   TRT_ShapedWeights biases(weights.TrtDType());
2216   const int output_axis = is_conv2d_backprop_input ? 1 : 0;
2217   const int noutput = weights.shape_.d[output_axis] * num_groups;
2218   nvinfer1::DimsHW kernel_size;
2219   kernel_size.h() = weights.shape_.d[2];
2220   kernel_size.w() = weights.shape_.d[3];
2221 
2222   // Add convolution.
2223   nvinfer1::ILayer* conv_layer = nullptr;
2224   if (is_conv2d_backprop_input) {
2225     nvinfer1::IDeconvolutionLayer* layer =
2226         params->converter->network()->addDeconvolution(
2227             *tensor->trt_tensor(), noutput, kernel_size,
2228             weights.GetTrtWeights(), biases.GetTrtWeights());
2229     TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
2230     layer->setStride(stride);
2231     // VALID padding is the default TRT behavior.
2232     if (attrs.get<string>("padding") == "SAME") {
2233       // SAME_UPPER means that post padding is preferred.
2234       layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
2235     }
2236     layer->setNbGroups(num_groups);
2237     conv_layer = layer;
2238   } else {
2239     nvinfer1::IConvolutionLayer* layer =
2240         params->converter->network()->addConvolution(
2241             *tensor->trt_tensor(), noutput, kernel_size,
2242             weights.GetTrtWeights(), biases.GetTrtWeights());
2243     TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
2244     layer->setStride(stride);
2245     if (attrs.get<string>("padding") == "SAME") {
2246       layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
2247     }
2248     layer->setNbGroups(num_groups);
2249     layer->setDilation(dilation);
2250     conv_layer = layer;
2251   }
2252   params->converter->SetLayerName(conv_layer, node_def, "conv");
2253   ITensorProxyPtr output_tensor = conv_layer->getOutput(0);
2254   // Add an extra padding for Deconv because TRT doesn't accept the
2255   // argument output_shape and thus the TRT output shape could be wrong
2256   // in case of strides>1.
2257   if (is_conv2d_backprop_input) {
2258     std::vector<int64_t> output_spatial_dims =
2259         GetSpatialDimsFromOutputSizes(backprop_output_size, h_index, w_index);
2260     const int output_height = output_spatial_dims[0];
2261     const int output_width = output_spatial_dims[1];
2262     nvinfer1::Dims trt_output_shape = output_tensor->getDimensions();
2263     // What determines the padding size is the difference between the given
2264     // input_sizes (tf_output_shape) and TRT computed size.
2265     int out_h_idx = params->use_implicit_batch ? 1 : 2;
2266     int out_w_idx = params->use_implicit_batch ? 2 : 3;
2267     const int height_diff = output_height - trt_output_shape.d[out_h_idx];
2268     const int width_diff = output_width - trt_output_shape.d[out_w_idx];
2269     if ((height_diff < 0) || (width_diff < 0)) {
2270       return errors::InvalidArgument(
2271           "input_sizes argument of Conv2DBackprop (i.e. output_shape argument "
2272           "of conv2d_transpose) ",
2273           "is too small for the given out_backprop argument of Conv2DBackprop "
2274           "(i.e. input argument of conv2d_transpose). Expect: ",
2275           "(", output_height, ", ", output_width, ") >= ", "(",
2276           trt_output_shape.d[out_h_idx], ", ", trt_output_shape.d[out_w_idx],
2277           ") for op ", node_def.name());
2278     }
2279     // Only add a padding layer if padding sizes are larger than 0
2280     if ((height_diff > 0) || (width_diff > 0)) {
2281       nvinfer1::DimsHW pre_padding(0, 0);
2282       nvinfer1::DimsHW post_padding(height_diff, width_diff);
2283       nvinfer1::IPaddingLayer* padding_layer =
2284           params->converter->network()->addPadding(*output_tensor->trt_tensor(),
2285                                                    pre_padding, post_padding);
2286       output_tensor = padding_layer->getOutput(0);
2287       params->converter->SetLayerName(padding_layer, node_def, "pad");
2288     }
2289   }
2290   // Restore transpose.
2291   if (need_transpose) {
2292     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
2293         output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC"));
2294   }
2295   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
2296   return Status::OK();
2297 }
2298 
2299 bool AllowInefficientTranspose() {
2300   static bool result = [] {
2301     bool value;
2302     Status status =
2303         ReadBoolFromEnvVar("TF_DEBUG_TRT_ALLOW_INEFFICIENT_TRANSPOSE",
2304                            /*default_value=*/false, &value);
2305     if (!status.ok()) {
2306       LOG(ERROR) << status;
2307     }
2308     return value;
2309   }();
2310 
2311   return result;
2312 }
2313 
2314 Status ConvertTranspose(OpConverterParams* params) {
2315   const auto& inputs = params->inputs;
2316   TF_RETURN_IF_ERROR(
2317       CheckInputsWeights(*params, {{"x", false}, {"perm", true}}));
2318   TF_RETURN_IF_ERROR(AllowDataTypes(
2319       *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
2320   // Get the permutation from weights.
2321   TRT_ShapedWeights weights = inputs.at(1).weights();
2322   const int* weights_ptr = static_cast<int*>(weights.GetValues());
2323   std::vector<int> perm(weights_ptr, weights_ptr + weights.count());
2324 
2325   // Verify the permutation.
2326   ITensorProxyPtr input_tensor = inputs.at(0).tensor();
2327   const int perm_size =
2328       params->use_implicit_batch ? perm.size() - 1 : perm.size();
2329   if (perm_size != size_t(input_tensor->getDimensions().nbDims)) {
2330     return errors::InvalidArgument(
2331         "Rank of perm for transpose does not match with that of the input.");
2332   }
2333   if (params->use_implicit_batch && perm[0] != 0) {
2334     return errors::Unimplemented(
2335         "Transpose at batch dimension is not supported.");
2336   }
2337 
2338 #if !IS_TRT_VERSION_GE(7, 1, 3, 4)
2339   // TensorRT versions before 7.1.3.4 are slow at transposing large tensors.
2340   // So check the tensor size, and don't convert if it is too large.
2341   constexpr int64_t kMaxEfficientTranspose = 2500000;
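  // (2,500,000 elements is roughly 10 MB of float32 data.)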
2342   int64_t tensor_size = TrtTensorDimsNumElements(input_tensor->getDimensions());
2343   if (!AllowInefficientTranspose() && tensor_size > kMaxEfficientTranspose) {
2344     return errors::Unimplemented(StrCat("Transpose too large: ", tensor_size));
2345   }
2346 #endif
2347 
2348   if (params->validation_only) return Status::OK();
2349 
2350   // Start conversion.
2351   ITensorProxyPtr output_tensor = nullptr;
2352   TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
2353       input_tensor, perm, &output_tensor, params->node_def));
2354   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
2355   return Status::OK();
2356 }
2357 
2358 Status ConvertShape(OpConverterParams* params) {
2359   const auto& inputs = params->inputs;
2360   TF_RETURN_IF_ERROR(
2361       CheckInputsWeights(*params, {{"input", TrtInputArg::kBoth}}));
2362   if (params->use_implicit_batch) {
2363     return errors::Unimplemented(
2364         "Shape is only supported for explicit batch mode.");
2365   }
2366   if (HasStaticShape(inputs.at(0).GetTrtDims())) {
2367     if (params->validation_only) return Status::OK();
2368     nvinfer1::Dims input_dims = inputs.at(0).GetTrtDims();
2369     nvinfer1::Dims output_dims{1, {input_dims.nbDims}};
2370     // Create a constant node holding the input's dimension values.
2371     TRT_ShapedWeights weight = params->weight_store->GetTempWeights(
2372         nvinfer1::DataType::kINT32, output_dims);
2373     int32* values_ptr = static_cast<int32*>(weight.GetValues());
2374     std::copy(input_dims.d, input_dims.d + input_dims.nbDims, values_ptr);
2375     auto output = params->converter->CreateConstantLayer(weight, output_dims);
2376     params->outputs->push_back(TRT_TensorOrWeights(output));
2377     return Status::OK();
2378   }
2379   if (params->validation_only) return Status::OK();
2380   nvinfer1::IShapeLayer* shape_layer = params->converter->network()->addShape(
2381       *inputs.at(0).tensor()->trt_tensor());
2382   TFTRT_RETURN_ERROR_IF_NULLPTR(shape_layer, params->node_def.name());
2383   params->converter->SetLayerName(shape_layer, params->node_def, "shape");
2384   params->outputs->push_back(TRT_TensorOrWeights(shape_layer->getOutput(0)));
2385   return Status::OK();
2386 }
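// Example of the static branch above (illustrative): an input with known
// TRT dims {3, 4} (nbDims == 2) becomes a 1D INT32 constant of length 2
// holding the values [3, 4]; no IShapeLayer is needed in that case.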
2387 
2388 Status ExpectShapeTensor(const TRT_TensorOrWeights& tensor) {
2389   if (tensor.tensor()->getType() != nvinfer1::DataType::kINT32) {
2390     return errors::InvalidArgument("Expected a shape tensor with INT32 type");
2391   }
2392   if (tensor.GetTrtDims().nbDims > 1) {
2393     return errors::InvalidArgument("Expected a 0D or 1D shape tensor");
2394   }
2395   return Status::OK();
2396 }
2397 
2398 // Converts Reshape op if the input has dynamic (unknown) dims.
2399 Status ConvertDynamicReshape(OpConverterParams* params) {
2400   if (params->use_implicit_batch) {
2401     return errors::InvalidArgument(
2402         "The input \"shape\" for Reshape must be a constant in implicit batch"
2403         " mode, at ",
2404         params->node_def.name());
2405   }
2406   if (!IS_TRT_VERSION_GE(7, 1, 3, 0)) {
2407     // While TRT officially supports shape value inputs, there are problems
2408     // with shape input handling that cause networks converted with
2409     // ConvertDynamicReshape to fail. Here we conservatively switch off the
2410     // converter before TRT 7.1.3.
2411     return errors::InvalidArgument(
2412         "Non constant shape input tensor for Reshape requires minimum TRT "
2413         "7.1.3");
2414   }
2415   const auto& inputs = params->inputs;
2416   const TRT_TensorOrWeights& input_tensor = inputs.at(0);
2417 
2418   // Since the "shape" input is a tensor here, it must be a shape tensor.
2419   TF_RETURN_IF_ERROR(ExpectShapeTensor(inputs.at(1)));
2420   if (inputs.at(1).tensor()->getDimensions().nbDims == 0) {
2421     // Dynamic reshape requires a 1D shape tensor.
2422     return errors::Unimplemented(
2423         "Reshape with dynamic input requires 1D input tensor, at ",
2424         params->node_def.name());
2425   }
2426   if (params->validation_only) return Status::OK();
2427   nvinfer1::IShuffleLayer* layer = params->converter->network()->addShuffle(
2428       *input_tensor.tensor()->trt_tensor());
2429   VLOG(2) << "ConvertReshape setInput (1) "
2430           << DebugString(inputs.at(1).tensor()->getDimensions());
2431   layer->setInput(1, *inputs.at(1).tensor()->trt_tensor());
2432   params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
2433   return Status::OK();
2434 }
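// Note on the path above: the target shape never becomes a compile-time
// constant. The 1D INT32 shape tensor is wired into the shuffle via
// setInput(1, ...), so TRT resolves the output dimensions at runtime.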
2435 
2436 // Converts Reshape in explicit batch mode if the input has static (known) dims.
2437 Status ConvertStaticReshapeForExplicitBatchMode(
2438     OpConverterParams* params, const int* output_dims, int num_dims,
2439     ITensorProxyPtr* output_tensor) {
2440   nvinfer1::Dims dims;
2441   dims.nbDims = num_dims;
2442   std::copy(output_dims, output_dims + num_dims, dims.d);
2443   return PrepareTensorForShape(params->converter, params->inputs.at(0), dims,
2444                                params->validation_only, output_tensor,
2445                                params->node_def);
2446 }
2447 
2448 // Converts Reshape in implicit batch mode. The input has static (known) dims.
2449 Status ConvertStaticReshapeForImplicitBatchMode(
2450     OpConverterParams* params, const int* output_shape_dims,
2451     int output_shape_dims_count, ITensorProxyPtr* output_tensor) {
2452   const auto& inputs = params->inputs;
2453   const TRT_TensorOrWeights& input_tensor = inputs.at(0);
2454   const int input_batch_dim = input_tensor.batch_size();
2455   const int output_batch_dim =
2456       (output_shape_dims_count > 0) ? output_shape_dims[0] : 0;
2457 
2458   const nvinfer1::Dims input_nonbatch_dims = input_tensor.GetTrtDims();
2459   nvinfer1::Dims output_nonbatch_dims;
2460   output_nonbatch_dims.nbDims = output_shape_dims_count - 1;
2461   for (int i = 1; i < output_shape_dims_count; i++) {
2462     output_nonbatch_dims.d[i - 1] = output_shape_dims[i];
2463   }
2464 
2465   VLOG(1) << "input_batch_dim=" << input_batch_dim
2466           << ", input_nonbatch_dims=" << DebugString(input_nonbatch_dims)
2467           << "\nresult_batch_dim=" << output_batch_dim
2468           << ", result_nonbatch_dims=" << DebugString(output_nonbatch_dims);
2469 
2470   // Check whether input_batch_dim and output_batch_dim will have the same
2471   // static value.
2472   bool reshape_may_change_batch_dim = false;
2473   if (input_batch_dim != -1 && output_batch_dim != -1) {
2474     reshape_may_change_batch_dim = (input_batch_dim != output_batch_dim);
2475   } else {
2476     reshape_may_change_batch_dim =
2477         !AreDimsStaticWithSameSize(input_nonbatch_dims, output_nonbatch_dims);
2478   }
2479   if (reshape_may_change_batch_dim) {
2480     const string msg =
2481         StrCat("Reshape on batch dimension is not supported, at ",
2482                params->node_def.name(), ". input_batch_dim=", input_batch_dim,
2483                ", ", DebugString(input_nonbatch_dims),
2484                "; output_batch_dim=", output_batch_dim, ", ",
2485                DebugString(output_nonbatch_dims));
2486     return errors::Unimplemented(msg);
2487   }
2488   // Perform the conversion.
2489   return PrepareTensorForShape(params->converter, input_tensor,
2490                                output_nonbatch_dims, params->validation_only,
2491                                output_tensor, params->node_def);
2492 }
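// Worked example of the batch-dim check above (illustrative): reshaping an
// input with batch_size == 8 and non-batch dims [2, 3] to shape [8, 3, 2]
// gives input_batch_dim == output_batch_dim == 8, so the conversion
// proceeds. For a target shape of [-1, 6], the batch dims are not both
// static, and the check falls back to comparing the static non-batch
// volumes (2 * 3 == 6), which also passes.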
2493 
2494 Status ConvertReshape(OpConverterParams* params) {
2495   const auto& inputs = params->inputs;
2496   TF_RETURN_IF_ERROR(CheckInputsWeights(
2497       *params,
2498       {{"tensor", TrtInputArg::kTensor}, {"shape", TrtInputArg::kBoth}}));
2499   TF_RETURN_IF_ERROR(AllowDataTypes(
2500       *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
2501   if (inputs.at(1).is_tensor()) {
2502     return ConvertDynamicReshape(params);
2503   }
2504 
2505   // TODO(bixia): we can't use inputs.at(1).weights().ToVector<int>() for two
2506   // reasons: (1) when weights.count()==0, TRT_ShapedWeights::tensor_ dtype is
2507   // not properly set to INT32; (2) a fix for the first problem ran into a
2508   // shared-pointer-related error in convert_nodes_test. We should fix both
2509   // problems and switch to inputs.at(1).weights().ToVector<int>(), a
2510   // type-safe method for accessing the content of the tensor.
2511   TRT_ShapedWeights weights = inputs.at(1).weights();
2512   if (weights.count() == 0 && params->use_implicit_batch) {
2513     return errors::Unimplemented("Reshape to shape=[] is not supported, at ",
2514                                  params->node_def.name());
2515   }
2516 
2517   const int* output_shape_dims = static_cast<int*>(weights.GetValues());
2518   size_t output_shape_dims_count = weights.count();
2519   ITensorProxyPtr output_tensor = nullptr;
2520 
2521   if (!params->use_implicit_batch) {
2522     TF_RETURN_IF_ERROR(ConvertStaticReshapeForExplicitBatchMode(
2523         params, output_shape_dims, output_shape_dims_count, &output_tensor));
2524   } else {
2525     TF_RETURN_IF_ERROR(ConvertStaticReshapeForImplicitBatchMode(
2526         params, output_shape_dims, output_shape_dims_count, &output_tensor));
2527   }
2528   if (params->validation_only) return Status::OK();
2529 
2530   // Record the conversion result.
2531   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
2532   return Status::OK();
2533 }
2534 
2535 Status ConvertExpandDims(OpConverterParams* params) {
2536   const auto& inputs = params->inputs;
2537   const auto& node_def = params->node_def;
2538   TF_RETURN_IF_ERROR(
2539       CheckInputsWeights(*params, {{"input", false}, {"axis", true}}));
2540   TF_RETURN_IF_ERROR(AllowDataTypes(
2541       *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
2542   // Get input shape as vector.
2543   const TRT_TensorOrWeights& input_tensor = inputs.at(0);
2544   const nvinfer1::Dims dims = input_tensor.GetTrtDims();
2545   std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
2546   // Get axis to expand on.
2547   auto axis = inputs.at(1).weights().GetSpan<int>();
2548   if (axis.size() != 1) {
2549     return errors::InvalidArgument("ExpandDims axis must be a scalar, at ",
2550                                    node_def.name());
2551   }
2552   // Use rank = nbDims + 1 for ConvertAxis's bounds checking, to account for
2553   // ExpandDims's ability to add an axis at the end of the shape.
2554   int trt_axis;
2555   TF_RETURN_IF_ERROR(ConvertAxis(axis[0], dims.nbDims + 1, node_def.name(),
2556                                  params->use_implicit_batch, &trt_axis));
2557   if (params->validation_only) return Status::OK();
2558   ITensorProxyPtr output_tensor = nullptr;
2559 
2560   if (!params->use_implicit_batch && !HasStaticShape(input_dims)) {
2561     TF_RETURN_IF_ERROR(params->converter->DynamicExpandDims(
2562         input_tensor.tensor(), dims, trt_axis, params, &output_tensor));
2563   } else {
2564     // ExpandDims: Insert new dim of size 1.
2565     input_dims.insert(input_dims.begin() + trt_axis, 1);
2566     // Reshape tensor.
2567     nvinfer1::Dims new_dims;
2568     TF_RETURN_IF_ERROR(ContainerToTrtDims(input_dims, &new_dims));
2569     TF_RETURN_IF_ERROR(PrepareTensorForShape(
2570         params->converter, input_tensor, new_dims, /*validation_only=*/false,
2571         &output_tensor, params->node_def));
2572   }
2573   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
2574   return Status::OK();
2575 }
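// Worked example (illustrative): in explicit batch mode with static input
// dims [2, 3], TF axis -1 converts to trt_axis == 2 (bounds are checked
// against rank nbDims + 1 == 3), and the reshape yields dims [2, 3, 1].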
2576 
2577 Status Converter::DynamicReshape(ITensorProxyPtr input,
2578                                  std::vector<std::pair<int, int>> slices,
2579                                  OpConverterParams* params,
2580                                  ITensorProxyPtr* output,
2581                                  std::vector<int> size_for_added_dims,
2582                                  absl::optional<int> op_instance) {
2583   *output = nullptr;
2584   // DynamicReshape relies on INetworkDefinition::addShape
2585   if (params->validation_only) {
2586     return errors::Internal(
2587         "DynamicReshape should not be used during validation");
2588   }
2589   ITensorProxyPtr shape =
2590       network()->addShape(*input->trt_tensor())->getOutput(0);
2591   // Build new shape = shape[:trt_axis] + [1] + shape[trt_axis:]
2592   std::vector<ITensorProxyPtr> concat_inputs;
2593   int max_num_slices = std::max(slices.size(), size_for_added_dims.size());
2594   int op_instance_value = op_instance.has_value() ? op_instance.value() : 0;
2595   for (int i = 0; i < max_num_slices; i++) {
2596     ITensorProxyPtr tensor;
2597     int slice_instance = i * max_num_slices + op_instance_value;
2598     // Maybe add a dimension sized by size_for_added_dims[i].
2599     if (i < size_for_added_dims.size() && size_for_added_dims[i] >= 0) {
2600       nvinfer1::Dims dims{1, {1}};
2601       if (size_for_added_dims[i] > 0) {
2602         dims.d[0] = size_for_added_dims[i];
2603       }
2604       TF_RETURN_IF_ERROR(
2605           CreateScalarConstant(params, std::min(size_for_added_dims[i], 1),
2606                                &tensor, nvinfer1::DataType::kINT32, dims));
2607       concat_inputs.push_back(tensor);
2608     }
2609     if (i < slices.size()) {
2610       nvinfer1::ISliceLayer* slice_layer = network()->addSlice(
2611           *shape->trt_tensor(), {1, {slices[i].first}},
2612           {1, {slices[i].second - slices[i].first}}, {1, {1}});
2613       concat_inputs.push_back(slice_layer->getOutput(0));
2614       SetLayerName(slice_layer, params->node_def, "slice", slice_instance);
2615     }
2616   }
2617   std::vector<nvinfer1::ITensor*> trt_concat_inputs;
2618   for (const auto& t : concat_inputs) {
2619     trt_concat_inputs.push_back(t->trt_tensor());
2620   }
2621   nvinfer1::IConcatenationLayer* concat_layer = network()->addConcatenation(
2622       static_cast<nvinfer1::ITensor* const*>(trt_concat_inputs.data()),
2623       concat_inputs.size());
2624   SetLayerName(concat_layer, params->node_def, "concat", op_instance);
2625   concat_layer->setAxis(0);
2626   ITensorProxyPtr new_shape = concat_layer->getOutput(0);
2627   // Reshape input using new shape
2628   nvinfer1::IShuffleLayer* shuffle =
2629       network()->addShuffle(*input->trt_tensor());
2630   SetLayerName(shuffle, params->node_def, "shuffle", op_instance);
2631   shuffle->setInput(1, *new_shape->trt_tensor());
2632   *output = shuffle->getOutput(0);
2633   return Status::OK();
2634 }
2635 
2636 Status Converter::DynamicExpandDims(ITensorProxyPtr input,
2637                                     const nvinfer1::Dims& dims, int axis,
2638                                     OpConverterParams* params,
2639                                     ITensorProxyPtr* output,
2640                                     absl::optional<int> op_instance) {
2641   if (params->validation_only) {
2642     *output = nullptr;
2643     return errors::Internal(
2644         "DynamicExpandDims should not be used during validation");
2645   }
2646   std::vector<std::pair<int, int>> slices;
2647   std::vector<int> extra_dims;
2648   if (axis != 0) {
2649     slices.push_back(std::pair<int, int>{0, axis});
2650     extra_dims.push_back(-1);
2651   }
2652   extra_dims.push_back(1);
2653   if (axis != dims.nbDims) {
2654     slices.push_back(std::pair<int, int>{axis, dims.nbDims});
2655   }
2656   return DynamicReshape(input, slices, params, output, extra_dims, op_instance);
2657 }
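// Worked example (illustrative): expanding axis == 1 on an input with
// dims.nbDims == 3 produces slices {{0, 1}, {1, 3}} and
// extra_dims {-1, 1}. DynamicReshape then assembles
//   new_shape = shape[0:1] + [1] + shape[1:3]
// from the IShapeLayer output, slice layers, and a scalar constant, and
// feeds it to the shuffle via setInput(1, ...).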
2658 
2659 Status Converter::SqueezeTensor(ITensorProxyPtr input,
2660                                 std::vector<int>* input_dims,
2661                                 OpConverterParams* params,
2662                                 ITensorProxyPtr* output) {
2663   // If the remaining dimensions of a squeeze operation have dynamic sizes, we
2664   // need to use TRT ops to build the result shape for the squeeze operation.
2665   // This is because IShuffleLayer::setReshapeDimensions treats -1 as a special
2666   // value.
2667   if (!params->use_implicit_batch && !HasStaticShape(*input_dims)) {
2668     std::vector<std::pair<int, int>> slices;
2669     for (int i = 0; i < input_dims->size(); i++) {
2670       if (input_dims->at(i) != 0) {
2671         slices.push_back(std::pair<int, int>(i, i + 1));
2672       }
2673     }
2674     return DynamicReshape(input, slices, params, output);
2675   }
2676   // Remove all dims which are equal to 0.
2677   input_dims->erase(std::remove(input_dims->begin(), input_dims->end(), 0),
2678                     input_dims->end());
2679   // Reshape tensor.
2680   nvinfer1::Dims new_dims;
2681   TF_RETURN_IF_ERROR(ContainerToTrtDims(*input_dims, &new_dims));
2682   VLOG(2) << "input_dims: " << DebugString(new_dims);
2683   TF_RETURN_IF_ERROR(PrepareTensorForShape(
2684       params->converter, TRT_TensorOrWeights(input), new_dims,
2685       /*validation_only=*/false, output, params->node_def));
2686   return Status::OK();
2687 }
2688 
2689 Status ConvertSqueeze(OpConverterParams* params) {
2690   const auto& inputs = params->inputs;
2691   const auto& node_def = params->node_def;
2692   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
2693   TF_RETURN_IF_ERROR(AllowDataTypes(
2694       *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
2695   // Get input shape.
2696   const TRT_TensorOrWeights& input_tensor = inputs.at(0);
2697   const nvinfer1::Dims dims = input_tensor.GetTrtDims();
2698   std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
2699   TFAttrs attrs(node_def);
2700   auto squeeze_dims = attrs.get<std::vector<int64>>("squeeze_dims");
2701   if (squeeze_dims.empty()) {
2702     if (params->use_implicit_batch || !HasStaticShape(dims)) {
2703       return errors::Unimplemented(
2704           "Squeeze is not implemented for empty squeeze_dims, at ",
2705           node_def.name());
2706     } else {
2707       // In explicit batch mode with a static input shape, we squeeze all
2708       // singleton dimensions.
2709       for (int& dim : input_dims) {
2710         if (dim == 1) {
2711           // Mark it for removal by setting it to 0
2712           dim = 0;
2713         }
2714       }
2715     }
2716   } else {
2717     std::vector<int> trt_axes;
2718     trt_axes.reserve(squeeze_dims.size());
2719     for (int tf_axis : squeeze_dims) {
2720       // If the axis is valid, then convert it to TRT axis, otherwise abort
2721       // conversion.
2722       int trt_axis;
2723       TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(),
2724                                      params->use_implicit_batch, &trt_axis));
2725       // Make sure target dimension is size 1 or unknown size (-1)
2726       if (input_dims[trt_axis] != -1 && input_dims[trt_axis] != 1) {
2727         return errors::InvalidArgument(
2728             "Dimension ", tf_axis, " with size ", input_dims[trt_axis],
2729             " cannot be squeezed because it must be size 1, at ",
2730             node_def.name());
2731       }
2732       trt_axes.push_back(trt_axis);
2733     }
2734     // Mark axes to remove by setting them to 0.
2735     for (int axis : trt_axes) {
2736       input_dims[axis] = 0;
2737     }
2738   }
2739   if (params->validation_only) return Status::OK();
2740 
2741   ITensorProxyPtr output_tensor = nullptr;
2742   TF_RETURN_IF_ERROR(params->converter->SqueezeTensor(
2743       input_tensor.tensor(), &input_dims, params, &output_tensor));
2744   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
2745   return Status::OK();
2746 }
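// Worked example (illustrative): input dims [2, 1, 3] with
// squeeze_dims == {1} marks the dims as [2, 0, 3]; SqueezeTensor then
// drops the zeros and reshapes to [2, 3]. With empty squeeze_dims in
// explicit batch mode and a static shape, every size-1 dim is marked.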
2747 
2748 template <typename Container>
2749 Status ConvertStridedSliceHelper(
2750     OpConverterParams* params, const TRT_TensorOrWeights& input,
2751     Container begin, Container size, const Container& stride,
2752     const nvinfer1::Dims* final_shape = nullptr,
2753     absl::optional<int> op_instance = absl::nullopt) {
2754   if (!params->use_implicit_batch &&
2755       (!HasStaticShape(begin) || !HasStaticShape(size))) {
2756     return errors::Unimplemented(
2757         "Strided slice op not implemented for dynamic shape input");
2758   }
2759   const auto& node_def = params->node_def;
2760   // Get input dims.
2761   nvinfer1::Dims dims = input.GetTrtDims();
2762   std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
2763   if (params->use_implicit_batch) {
2764     // Begin, size and stride do include the explicit batch dim. Add the
2765     // batch dimension to input_dims so that the indexes line up properly.
2766     input_dims.insert(input_dims.begin(), -1);
2767   }
2768   // Check bounds.
2769   for (int i = 1; i < input_dims.size(); i++) {
2770     if (input_dims[i] < 0 || size[i] < 0) continue;
2771     if (begin[i] < 0 || begin[i] > input_dims[i]) {
2772       return errors::InvalidArgument("\"begin\" for dimension ",
2773                                      std::to_string(i), " in ", node_def.op(),
2774                                      " is out of range, at ", node_def.name());
2775     }
2776     int end = begin[i];
2777     if (size[i] > 0) end += (size[i] - 1) * stride[i];
2778     if (end < 0 || end > input_dims[i]) {
2779       return errors::InvalidArgument("\"begin\" + \"size\" for dimension ",
2780                                      std::to_string(i), " in ", node_def.op(),
2781                                      " is out of range, at ", node_def.name());
2782     }
2783   }
2784 
2785   nvinfer1::Dims begin_dims, size_dims, stride_dims;
2786   TF_RETURN_IF_ERROR(
2787       ContainerToTrtDims(begin, &begin_dims,
2788                          /*ignore_first_dim=*/params->use_implicit_batch));
2789   TF_RETURN_IF_ERROR(
2790       ContainerToTrtDims(size, &size_dims,
2791                          /*ignore_first_dim=*/params->use_implicit_batch));
2792   TF_RETURN_IF_ERROR(
2793       ContainerToTrtDims(stride, &stride_dims, params->use_implicit_batch));
2794   if (params->validation_only) return Status::OK();
2795 
2796   VLOG(2) << "Adding slice layer with begin=" << DebugString(begin_dims)
2797           << ", size=" << DebugString(size_dims)
2798           << ", stride=" << DebugString(stride_dims);
2799   nvinfer1::ISliceLayer* layer = params->converter->network()->addSlice(
2800       *input.tensor()->trt_tensor(), begin_dims, size_dims, stride_dims);
2801   params->converter->SetLayerName(layer, params->node_def, "slice",
2802                                   op_instance);
2803   ITensorProxyPtr tensor = layer->getOutput(0);
2804   // Reshape for shrink_axis.
2805   if (final_shape) {
2806     TF_RETURN_IF_ERROR(PrepareTensorForShape(
2807         params->converter, TRT_TensorOrWeights(tensor), *final_shape,
2808         /*validation_only=*/false, &tensor, node_def, op_instance));
2809   }
2810   params->outputs->push_back(TRT_TensorOrWeights(tensor));
2811   return Status::OK();
2812 }
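// Worked example of the helper above (illustrative): for a non-batch dim
// of size 5 with begin = 1, size = 2, stride = 2, the slice selects
// elements 1 and 3, and the bounds check verifies
// end = 1 + (2 - 1) * 2 = 3 <= 5.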
2813 
2814 Status ConvertSlice(OpConverterParams* params) {
2815   const auto& inputs = params->inputs;
2816   const auto& node_def = params->node_def;
2817   TF_RETURN_IF_ERROR(CheckInputsWeights(
2818       *params, {{"input", false}, {"begin", true}, {"size", true}}));
2819   TF_RETURN_IF_ERROR(AllowDataTypes(
2820       *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
2821   std::vector<int> begin = inputs.at(1).weights().ToVector<int>();
2822   std::vector<int> size = inputs.at(2).weights().ToVector<int>();
2823   // Get input dims.
2824   nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
2825   std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
2826   // Add batch dimension so that indexes line up properly.
2827   if (params->use_implicit_batch) {
2828     input_dims.insert(input_dims.begin(), inputs.at(0).batch_size());
2829   }
2830   if (!AllLengthsEqual({input_dims, begin, size})) {
2831     return errors::InvalidArgument(
2832         "Length of begin and size arguments must equal rank of input for "
2833         "Slice, at ",
2834         node_def.name());
2835   }
2836   // Check that batch dimension is unmodified.
2837   if (params->use_implicit_batch) {
2838     const bool begin_is_modified = begin[0] != 0;
2839     // If size[0] is not -1, we can only know whether the batch dimension is
2840     // unmodified when the batch size is defined. When the batch size is
2841     // undefined, we don't convert, to be safe.
2842     const bool size_is_unchanged = size[0] == -1 || size[0] == input_dims[0];
2843     if (begin_is_modified || !size_is_unchanged) {
2844       return errors::Unimplemented(
2845           "TensorRT does not allow modifications to the batch dimension, at ",
2846           node_def.name());
2847     }
2848   }
2849   // Size of -1 signifies to take all remaining elements.
2850   for (int i = 0; i < input_dims.size(); i++) {
2851     if (size[i] == -1) {
2852       if (input_dims[i] == -1) {
2853         return errors::Unimplemented(
2854             "Input dims must be defined for size = -1, at ", node_def.name());
2855       }
2856       size[i] = input_dims[i] - begin[i];
2857     } else if (size[i] < -1) {
2858       return errors::InvalidArgument("Invalid size value at ", node_def.name());
2859     }
2860     if (input_dims[i] != -1 && (begin[i] < 0 || begin[i] > input_dims[i])) {
2861       return errors::InvalidArgument("\"begin\" for dimension ",
2862                                      std::to_string(i), " in ", node_def.op(),
2863                                      " is out of range, at ", node_def.name());
2864     }
2865     const int end = begin[i] + size[i];
2866     if (input_dims[i] != -1 && (end < 0 || end > input_dims[i])) {
2867       return errors::InvalidArgument("\"begin\" + \"size\" for dimension ",
2868                                      std::to_string(i), " in ", node_def.op(),
2869                                      " is out of range, at ", node_def.name());
2870     }
2871   }
2872   // Stride is 1 for all dims.
2873   std::vector<int> stride(begin.size(), 1);
2874   return ConvertStridedSliceHelper(params, inputs.at(0), begin, size, stride);
2875 }
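// Worked example (illustrative): input dims [4, 6] with begin = {0, 2} and
// size = {-1, 3}: size[0] == -1 expands to 4 - 0 = 4, producing a 4x3
// slice that starts at column 2, with stride fixed at 1 for every dim.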
2876 
2877 Status ConvertStridedSlice(OpConverterParams* params) {
2878   const auto& inputs = params->inputs;
2879   const auto& node_def = params->node_def;
2880   // The TF op allows negative begin/end indices, while TRT requires values
2881   // within bounds. This is because we use the default slice mode with TRT
2882   // (see ISliceLayer::setMode): "fail with error when the coordinates are
2883   // out of bounds". If begin/end tensors have negative values, we map them
2884   // to positive values. The way this is currently implemented requires that
2885   // begin/end are constants, therefore we allow only weights for begin/end.
2886   //
2887   // The output size is determined by begin, end and strides. For shape
2888   // tensors TRT requires that the output size is known at engine
2889   // construction time. To reduce the complexity of the converter, we also
2890   // require a constant size for non-shape input. This implies that the
2891   // stride input also has to be a constant (weights).
2892   TF_RETURN_IF_ERROR(CheckInputsWeights(
2893       *params,
2894       {{"input", false}, {"begin", true}, {"end", true}, {"strides", true}}));
2895   TF_RETURN_IF_ERROR(AllowDataTypes(
2896       *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
2897 
2898   // TODO(tfeher): Enable dynamic shape input.
2899   if (!HasStaticShape(inputs.at(0).GetTrtDims())) {
2900     return errors::Unimplemented(
2901         "Strided slice op not implemented for dynamic shape input");
2902   }
2903   TFAttrs attrs(node_def);
2904   // New_axis_mask is not supported. TODO(tfeher): Support this by expanddims.
2905   const int32 new_axis_mask = attrs.get<int64>("new_axis_mask");
2906   if (new_axis_mask != 0) {
2907     return errors::Unimplemented(
2908         "new_axis_mask is not supported for StridedSlice, at ",
2909         node_def.name());
2910   }
2911   const int32 begin_mask = attrs.get<int64>("begin_mask");
2912   const int32 end_mask = attrs.get<int64>("end_mask");
2913   const int32 ellipsis_mask = attrs.get<int64>("ellipsis_mask");
2914   const int32 shrink_axis_mask = attrs.get<int64>("shrink_axis_mask");
2915 
2916   // Get input dims.
2917   nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
2918   std::vector<int64> input_dims(dims.d, dims.d + dims.nbDims);
2919   // Add batch dimension so that indexes line up properly. Set it to -1 if it's
2920   // unknown, so ValidateStridedSliceOp() can handle it correctly below.
2921   if (params->use_implicit_batch) {
2922     input_dims.insert(input_dims.begin(),
2923                       std::max(-1, inputs.at(0).batch_size()));
2924   }
2925 
2926   const TRT_ShapedWeights& begin_weights = inputs.at(1).weights();
2927   const TRT_ShapedWeights& end_weights = inputs.at(2).weights();
2928   const TRT_ShapedWeights& stride_weights = inputs.at(3).weights();
2929   if (!AllLengthsEqual({begin_weights.ToVector<int>(),
2930                         end_weights.ToVector<int>(),
2931                         stride_weights.ToVector<int>()})) {
2932     return errors::InvalidArgument(
2933         "Length of begin, end, and stride must be equal, at ", node_def.name());
2934   }
2935 
2936   // The slice op has many ways to define the actual operation that needs to be
2937   // performed. We use ValidateStridedSliceOp to map the input parameters to
2938   // begin, end, & strides. ValidateStridedSliceOp makes an effort to set known
2939   // (static) begin/end/strides parameters. On return, begin, end, stride, and
2940   // processing_shape have the same rank as the input; final_shape has extra
2941   // dims added/removed. Negative values in begin/end/stride are converted to
2942   // positive values to produce a known processing_shape if the input shape is
2943   // static. Otherwise, processing_shape and final_shape may contain unknown
2944   // dimension values.
2945   PartialTensorShape input_shape(input_dims);
2946   PartialTensorShape processing_shape;
2947   PartialTensorShape final_shape;
2948   bool is_identity;
2949   bool is_simple_slice;
2950   bool slice_dim0;
2951   absl::InlinedVector<int64, 4> begin;
2952   absl::InlinedVector<int64, 4> end;
2953   absl::InlinedVector<int64, 4> strides;
2954   TF_RETURN_IF_ERROR(ValidateStridedSliceOp(
2955       &begin_weights.GetTensor(), &end_weights.GetTensor(),
2956       stride_weights.GetTensor(), input_shape, begin_mask, end_mask,
2957       ellipsis_mask, new_axis_mask, shrink_axis_mask, &processing_shape,
2958       &final_shape, &is_identity, &is_simple_slice, &slice_dim0, &begin, &end,
2959       &strides));
2960 
2961   // If the batch dimension is covered by the ellipsis mask, it is left
2962   // untouched. Otherwise we check here whether the op modifies the batch dim.
2963   if (params->use_implicit_batch &&
2964       (!(ellipsis_mask & 1) ||
2965        begin_weights.shape_.nbDims >= input_dims.size())) {
2966     // Check that batch dimension is unmodified. We need to use the expanded
2967     // begin/end/strides array since the original array may be incorrect when
2968     // (ellipsis_mask&1)==1.
2969     const bool begin_is_modified = !(begin_mask & 1) && (begin[0] != 0);
2970     const bool stride_is_modified = (strides[0] != 1);
2971     // If the batch size is -1 and the end mask is not set, we can only know if
2972     // the batch dimension is unmodified when the batch size is defined. When
2973     // the batch size is undefined, we don't convert to be safe.
2974     const bool batch_size_is_defined = (input_dims[0] > 0);
2975     const bool end_is_modified =
2976         !(end_mask & 1) && (!batch_size_is_defined ||
2977                             (batch_size_is_defined && end[0] != input_dims[0]));
2978     if (begin_is_modified || stride_is_modified || end_is_modified) {
2979       return errors::Unimplemented(
2980           "TensorRT does not allow modifications to the batch dimension, at ",
2981           node_def.name());
2982     }
2983   }
2984   // Can't shrink axis on batch dimension.
2985   if (params->use_implicit_batch && shrink_axis_mask & 1) {
2986     return errors::Unimplemented(
2987         "TensorRT does not allow modifications to the batch dimension, at ",
2988         node_def.name());
2989   }
2990 
2991   // TRT Slice layer uses (begin, size) instead of (begin, end). We calculate
2992   // the size if possible, otherwise we set it to -1.
2993   absl::InlinedVector<int64, 4> size(input_dims.size());
2994   for (int i = 0; i < input_dims.size(); i++) {
2995     if (input_dims[i] < 0) {
2996       // Often begin[i] and end[i] could be used to calculate the size.
2997       // (Although the presence of begin/end masks makes it non-trivial,
2998       // because a 0 value might indicate that a mask was used.) But the size
2999       // has to be clamped to match the array size, for which we need the dynamic
3000       // version of the helper routines. Therefore we set size to -1,
3001       // which will select the dynamic shape helper (to be implemented).
3002       size[i] = -1;
3003       continue;
3004     }
3005     // Divide by stride (round up).
3006     size[i] = strides[i] > 0
3007                   ? (end[i] - begin[i] + strides[i] - 1) / strides[i]
3008                   : (begin[i] - end[i] + abs(strides[i]) - 1) / abs(strides[i]);
3009     if (size[i] < 0) {
3010       return errors::InvalidArgument(
3011           "\"size\" cannot be negative for StridedSlice");
3012     }
3013   }
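  // Worked example of the ceil-division above (illustrative): begin = 1,
  // end = 7, stride = 2 gives size = (7 - 1 + 2 - 1) / 2 = 3 (elements
  // 1, 3 and 5); begin = 7, end = 1, stride = -2 likewise gives
  // size = (7 - 1 + 2 - 1) / 2 = 3 (elements 7, 5 and 3).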
3014 
3015   // shrink_axis_mask requires a reshape after the slice.
3016   nvinfer1::Dims final_shape_dims;
3017   nvinfer1::Dims* final_shape_dims_ptr = nullptr;
3018   if (shrink_axis_mask) {
3019     TF_RETURN_IF_ERROR(TensorShapeToTrtDims(
3020         final_shape, /*ignore_first_dim=*/params->use_implicit_batch,
3021         &final_shape_dims));
3022     final_shape_dims_ptr = &final_shape_dims;
3023   }
3024 
3025   return ConvertStridedSliceHelper(params, inputs.at(0), begin, size, strides,
3026                                    final_shape_dims_ptr, 0);
3027 }
3028 
3029 Status ConvertConv2D(OpConverterParams* params) {
3030   return ConvertConv2DHelper(params, 1, /*is_conv2d_backprop_input=*/false);
3031 }
3032 
3033 Status ConvertConv2DDepthwise(OpConverterParams* params) {
3034   return ConvertConv2DHelper(params, 0, /*is_conv2d_backprop_input=*/false);
3035 }
3036 
3037 Status ConvertConv2DBackpropInput(OpConverterParams* params) {
3038   return ConvertConv2DHelper(params, 1, /*is_conv2d_backprop_input=*/true);
3039 }
3040 
3041 Status ConvertConv3DHelper(OpConverterParams* params, int group,
3042                            bool is_conv3d_backprop_input = false) {
3043   const int kNumDims = 5;
3044   const auto& inputs = params->inputs;
3045   const auto& node_def = params->node_def;
3046   TRT_TensorOrWeights backprop_output_size;
3047   ITensorProxyPtr tensor = nullptr;
3048   if (is_conv3d_backprop_input) {
3049     // In the case when Conv3dBackpropInput is used for conv3d_transpose, these
3050     // inputs correspond to: output size, filter, and input.
3051     TF_RETURN_IF_ERROR(CheckInputsWeights(
3052         *params,
3053         {{"input_sizes", true}, {"filter", true}, {"out_backprop", false}}));
3054     backprop_output_size = inputs.at(0);
3055     tensor = inputs.at(2).tensor();
3056   } else {
3057     TF_RETURN_IF_ERROR(
3058         CheckInputsWeights(*params, {{"input", false}, {"filter", true}}));
3059     tensor = inputs.at(0).tensor();
3060   }
3061   TF_RETURN_IF_ERROR(
3062       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3063   const TRT_ShapedWeights weights_drsck = inputs.at(1).weights();
3064   if (weights_drsck.shape_.nbDims != kNumDims) {
3065     return errors::InvalidArgument("Conv3D expects kernel of dimension 5, at ",
3066                                    node_def.name());
3067   }
3068   TFAttrs attrs(node_def);
3069   auto data_format = attrs.get<string>("data_format");
3070   const bool is_ndhwc = (data_format == "NDHWC");  // Or NCDHW: 01234 -> 02341
3071   const int d_index = is_ndhwc ? 1 : 2;
3072   const int h_index = is_ndhwc ? 2 : 3;
3073   const int w_index = is_ndhwc ? 3 : 4;
3074   const int c_index = is_ndhwc ? 4 : 1;
3075   auto tf_dilations = attrs.get<std::vector<int64>>("dilations");
3076   if (tf_dilations.size() != kNumDims) {
3077     return errors::InvalidArgument(
3078         "Convolution dilations field must specify 5 dimensions, at ",
3079         node_def.name());
3080   }
3081   if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) {
3082     return errors::Unimplemented(
3083         "Dilation rate must be 1 for batch and channel dimensions, at ",
3084         node_def.name());
3085   }
3086 
3087   const nvinfer1::Dims3 dilation_dhw(
3088       tf_dilations[d_index], tf_dilations[h_index], tf_dilations[w_index]);
3089   if (is_conv3d_backprop_input &&
3090       (dilation_dhw.d[0] != 1 || dilation_dhw.d[1] != 1 ||
3091        dilation_dhw.d[2] != 1)) {
3092     return errors::Unimplemented(
3093         "Dilation with Conv3DBackpropInputV2 (conv3d_transpose) is not "
3094         "supported",
3095         ", at ", node_def.name());
3096   }
3097 
3098   const auto tf_stride = attrs.get<std::vector<int64>>("strides");
3099   if (tf_stride.size() != kNumDims) {
3100     return errors::InvalidArgument(
3101         "Convolution strides field must specify 5 dimensions, at ",
3102         node_def.name());
3103   }
3104   if (tf_stride[0] != 1 || tf_stride[c_index] != 1) {
3105     return errors::Unimplemented(
3106         "Stride must be 1 for batch and channel dimensions, at ",
3107         node_def.name());
3108   }
3109 
3110   const nvinfer1::Dims3 stride_dhw(tf_stride[d_index], tf_stride[h_index],
3111                                    tf_stride[w_index]);
3112   const auto tensor_dim = tensor->getDimensions();
3113 
3114   // Asymmetric padding on deconvolution is not supported for now.
3115   if (is_conv3d_backprop_input && attrs.get<string>("padding") == "SAME") {
3116     TRT_ShapedWeights weights =
3117         params->weight_store->GetTempWeights(weights_drsck);
3118 
3119     nvinfer1::Dims3 effective_kernel_size(
3120         weights.shape_.d[0] +
3121             (weights.shape_.d[0] - 1) * (dilation_dhw.d[0] - 1),  // D
3122         weights.shape_.d[1] +
3123             (weights.shape_.d[1] - 1) * (dilation_dhw.d[1] - 1),  // R
3124         weights.shape_.d[2] +
3125             (weights.shape_.d[2] - 1) * (dilation_dhw.d[2] - 1)  // S
3126     );
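    // Illustrative numbers: a kernel extent of 3 with dilation 2 has an
    // effective extent of 3 + (3 - 1) * (2 - 1) = 5, which is the value
    // CreateSamePadding needs to compute symmetric SAME padding.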
3127 
3128     const auto output_size_weights =
3129         static_cast<int*>(backprop_output_size.weights().GetValues());
3130     const std::vector<int64_t> input_dims = {output_size_weights[d_index],
3131                                              output_size_weights[h_index],
3132                                              output_size_weights[w_index]};
3133 
3134     const std::vector<std::pair<int, int>> padding =
3135         CreateSamePadding(stride_dhw, effective_kernel_size, input_dims);
3136 
3137     if (padding[0].first != padding[0].second ||
3138         padding[1].first != padding[1].second ||
3139         padding[2].first != padding[2].second) {
3140       return errors::Unimplemented(
3141           "Asymmetric padding with Conv3DBackpropInputV2 (conv3d_transpose) is "
3142           "not supported, at ",
3143           node_def.name());
3144     }
3145   }
3146 
3147   // Channel dim must be static for Conv3D since we use that value for
3148   // num_groups at build time.
3149   // TODO: Allow conversion if kImplicitBatchModeCompatible||kOptimal is used.
3150   int implicit_batch_offset = params->use_implicit_batch ? -1 : 0;
3151   if (tensor->getDimensions().d[c_index + implicit_batch_offset] == -1) {
3152     return errors::InvalidArgument("Channel dimension must be static, at ",
3153                                    node_def.name());
3154   }
3155 
3156   // Finished validation checks
3157   if (params->validation_only) return Status::OK();
3158 
3159   // Transpose to NCDHW (NCDHW is required for IConvLayer).
3160   const bool need_transpose = is_ndhwc;
3161   if (need_transpose) {
3162     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
3163         tensor, {0, 4, 1, 2, 3}, &tensor, node_def, "to_NCDHW"));
3164   }
3165 
3166   // group == 0 signifies that this is a depthwise convolution, so set
3167   // num_groups to the size of the input's channel dim. For a non-depthwise
3168   // conv, num_groups will be 1.
3169   const int num_groups = (group == 0) ? tensor_dim.d[0] : group;
3170 
3171   // For conv, TF weights are DRSCK, and TRT expects KCDRS.
3172   // For backprop, TF weights are DRSKC, and TRT expects KCDRS.
3173   // Therefore, this reorder will work for both cases.
3174   TRT_ShapedWeights weights =
3175       params->weight_store->GetTempWeights(weights_drsck);
3176   ReorderDRSCKToKCDRS(weights_drsck, &weights, num_groups);
3177   TRT_ShapedWeights biases(weights.TrtDType());
3178   const int output_axis = is_conv3d_backprop_input ? 1 : 0;
3179   const int noutput = weights.shape_.d[output_axis] * num_groups;
3180   nvinfer1::Dims3 kernel_size_drs(weights.shape_.d[2],  // D
3181                                   weights.shape_.d[3],  // R
3182                                   weights.shape_.d[4]   // S
3183   );
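  // Shape bookkeeping example (illustrative): DRSCK weights of shape
  // (3, 3, 3, 8, 16) reorder to KCDRS, so with num_groups == 1 a forward
  // conv gets noutput = d[0] * num_groups = 16 and
  // kernel_size_drs = (3, 3, 3) from d[2..4]; for backprop the output
  // channel count is read from d[1] instead.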
3184 
3185   // Add convolution.
3186   nvinfer1::ILayer* conv_layer = nullptr;
3187   if (is_conv3d_backprop_input) {
3188     nvinfer1::IDeconvolutionLayer* layer =
3189         params->converter->network()->addDeconvolutionNd(
3190             *tensor->trt_tensor(), noutput, kernel_size_drs,
3191             weights.GetTrtWeights(), biases.GetTrtWeights());
3192     TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3193     layer->setStrideNd(stride_dhw);
3194 
3195     if (attrs.get<string>("padding") == "SAME") {
3196       VLOG(2) << "Using SAME padding";
3197       // SAME_UPPER means that post padding is preferred.
3198       layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
3199     }
3200 
3201     layer->setNbGroups(num_groups);
3202     conv_layer = layer;
3203   } else {
3204     nvinfer1::IConvolutionLayer* layer =
3205         params->converter->network()->addConvolutionNd(
3206             *tensor->trt_tensor(), noutput, kernel_size_drs,
3207             weights.GetTrtWeights(), biases.GetTrtWeights());
3208     TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3209     layer->setStrideNd(stride_dhw);
3210 
3211     if (attrs.get<string>("padding") == "SAME") {
3212       VLOG(2) << "Using SAME padding";
3213       layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
3214     }
3215 
3216     layer->setNbGroups(num_groups);
3217     layer->setDilationNd(dilation_dhw);
3218     conv_layer = layer;
3219   }
3220   params->converter->SetLayerName(conv_layer, node_def, "conv");
3221   ITensorProxyPtr output_tensor = conv_layer->getOutput(0);
3222 
3223   // Restore transpose.
3224   if (need_transpose) {
3225     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
3226         output_tensor, {0, 2, 3, 4, 1}, &output_tensor, node_def, "to_NDHWC"));
3227   }
3228   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
3229   return Status::OK();
3230 }
3231 
3232 Status ConvertConv3D(OpConverterParams* params) {
3233   return ConvertConv3DHelper(params, 1, /*is_conv3d_backprop_input=*/false);
3234 }
3235 
3236 Status ConvertConv3DBackpropInputV2(OpConverterParams* params) {
3237   return ConvertConv3DHelper(params, 1, /*is_conv3d_backprop_input=*/true);
3238 }
3239 
3240 Status ConvertPool3D(OpConverterParams* params) {
3241   const int kNumDims = 5;
3242   const auto& inputs = params->inputs;
3243   const auto& node_def = params->node_def;
3244   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
3245   TF_RETURN_IF_ERROR(
3246       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3247   nvinfer1::PoolingType type;
3248   if (node_def.op() == "MaxPool3D") {
3249     type = nvinfer1::PoolingType::kMAX;
3250   } else if (node_def.op() == "AvgPool3D") {
3251     type = nvinfer1::PoolingType::kAVERAGE;
3252   } else {
3253     return errors::Unimplemented("Unsupported pooling type: ", node_def.op(),
3254                                  ", at ", node_def.name());
3255   }
3256   TFAttrs attrs(node_def);
3257   const string padding_type = attrs.get<string>("padding");
3258   if ((padding_type != "SAME") && (padding_type != "VALID")) {
3259     return errors::Unimplemented("Unsupported padding type: ", padding_type,
3260                                  ", at ", node_def.name());
3261   }
3262   const auto data_format = attrs.get<string>("data_format");
3263   const bool is_ndhwc = (data_format == "NDHWC");
3264   const int c_index = is_ndhwc ? 4 : 1;
3265   const int d_index = is_ndhwc ? 1 : 2;
3266   const int h_index = is_ndhwc ? 2 : 3;
3267   const int w_index = is_ndhwc ? 3 : 4;
3268   const auto tf_stride = attrs.get<std::vector<int64>>("strides");
3269   if (tf_stride.size() != kNumDims) {
3270     return errors::InvalidArgument(
3271         "Pooling strides field must specify 5 dimensions, at ",
3272         node_def.name());
3273   }
3274   if (tf_stride[0] != 1 || tf_stride[c_index] != 1) {
3275     return errors::Unimplemented(
3276         "stride must be 1 for batch and channel dimensions, at ",
3277         node_def.name());
3278   }
3279   const auto tf_kernel = attrs.get<std::vector<int64>>("ksize");
3280   if (tf_kernel.size() != kNumDims) {
3281     return errors::InvalidArgument(
3282         "Pooling ksize field must specify 5 dimensions, at ", node_def.name());
3283   }
3284   if (tf_kernel[0] != 1 || tf_kernel[c_index] != 1) {
3285     return errors::Unimplemented(
3286         "ksize must be 1 for batch and channel dimensions, at ",
3287         node_def.name());
3288   }
3289   if (params->validation_only) return Status::OK();
3290 
3291   ITensorProxyPtr tensor = inputs.at(0).tensor();
3292   if (data_format == "NDHWC") {
3293     // NDHWC => NCDHW
3294     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
3295         tensor, {0, 4, 1, 2, 3}, &tensor, node_def, "to_NCDHW"));
3296   }
3297 
3298   const nvinfer1::Dims3 stride(tf_stride[d_index], tf_stride[h_index],
3299                                tf_stride[w_index]);
3300   const nvinfer1::Dims3 ksize(tf_kernel[d_index], tf_kernel[h_index],
3301                               tf_kernel[w_index]);
3302 
3303   nvinfer1::IPoolingLayer* layer = params->converter->network()->addPoolingNd(
3304       *tensor->trt_tensor(), type, ksize);
3305   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3306 
3307   layer->setStrideNd(stride);
3308   // VALID padding is the default TRT behavior.
3309   if (padding_type == "SAME") {
3310     // SAME_UPPER means that post padding is preferred.
3311     layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
3312   }
3313   params->converter->SetLayerName(layer, node_def, "pooling");
3314 
3315   ITensorProxyPtr output_tensor = layer->getOutput(0);
3316   if (data_format == "NDHWC") {
3317     // NCDHW => NDHWC
3318     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
3319         output_tensor, {0, 2, 3, 4, 1}, &output_tensor, node_def, "to_NDHWC"));
3320   }
3321 
3322   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
3323   return Status::OK();
3324 }
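// Worked example (illustrative): for NDHWC input with
// strides = {1, 2, 2, 2, 1} and ksize = {1, 3, 3, 3, 1}, the pooling layer
// receives stride (2, 2, 2) and window (3, 3, 3) after the d/h/w index
// mapping, with NDHWC -> NCDHW transposes wrapped around the op.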
3325 
3326 Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) {
3327   const auto& inputs = params->inputs;
3328   const auto& node_def = params->node_def;
3329 
3330   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false},
3331                                                   {"filter", true},
3332                                                   {"bias", true},
3333                                                   {"side_input", true},
3334                                                   {"conv_input_scale", true},
3335                                                   {"side_input_scale", true}}));
3336   ITensorProxyPtr tensor = inputs.at(0).tensor();
3337   TF_RETURN_IF_ERROR(
3338       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3339   TRT_ShapedWeights weights = inputs.at(1).weights();
3340   if (weights.shape_.nbDims != 4) {
3341     return errors::InvalidArgument(
3342         "FusedConv2DBiasActivation expects kernel of dimension 4, at " +
3343         node_def.name());
3344   }
3345   TFAttrs attrs(node_def);
3346   auto data_format = attrs.get<string>("data_format");
3347   if (data_format != "NHWC" && data_format != "NCHW") {
3348     return errors::InvalidArgument("Unsupported data_format:", data_format,
3349                                    " at ", node_def.name());
3350   }
3351 
3352   int c_index = (data_format == "NHWC") ? 3 : 1;
3353   int h_index = (data_format == "NHWC") ? 1 : 2;
3354   int w_index = (data_format == "NHWC") ? 2 : 3;
3355   auto tf_dilations = attrs.get<std::vector<int64>>("dilations");
3356   if (tf_dilations.size() != 4) {
3357     return errors::InvalidArgument(
3358         "Convolution dilations field must specify 4 dimensions, at ",
3359         node_def.name());
3360   }
3361   if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) {
3362     return errors::Unimplemented(
3363         "Dilation rate must be 1 for batch and channel dimensions, at ",
3364         node_def.name());
3365   }
3366   const nvinfer1::DimsHW dilation(tf_dilations[h_index], tf_dilations[w_index]);
3367 
3368   const auto tf_stride = attrs.get<std::vector<int64>>("strides");
3369   if (tf_stride.size() != 4) {
3370     return errors::InvalidArgument(
3371         "Convolution strides field must specify 4 dimensions, at ",
3372         node_def.name());
3373   }
3374   if (tf_stride[0] != 1 || tf_stride[c_index] != 1) {
3375     return errors::Unimplemented(
3376         "Stride must be 1 for batch and channel dimensions, at ",
3377         node_def.name());
3378   }
3379   const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
3380   const auto activation_mode = attrs.get<string>("activation_mode");
3381   auto op_pair = ActivationTypeMap()->find(activation_mode);
3382   if (op_pair == ActivationTypeMap()->end() && activation_mode != "None") {
3383     return errors::Unimplemented("Activation mode: ", activation_mode,
3384                                  " not supported at: ", node_def.name());
3385   }
3386 
3387   const auto filter_format = attrs.get<string>("filter_format");
3388   if (filter_format != "HWIO" && filter_format != "OIHW") {
3389     return errors::InvalidArgument("Unsupported filter_format:", filter_format,
3390                                    " at ", node_def.name());
3391   }
3392   // Check that there is no side_input and that conv_input_scale is 1.
3393   TRT_ShapedWeights side_input = inputs.at(3).weights();
3394   if (side_input.count() != 0) {
3395     return errors::InvalidArgument(
3396         "FusedConv2DBiasActivation doesn't yet support side_input, at " +
3397         node_def.name());
3398   }
3399   TRT_ShapedWeights conv_input_scale = inputs.at(4).weights();
3400   if (conv_input_scale.count() != 1 ||
3401       conv_input_scale.TrtDType() != nvinfer1::DataType::kFLOAT ||
3402       conv_input_scale.GetSpan<float>()[0] != 1.0) {
3403     return errors::InvalidArgument(
3404         "FusedConv2DBiasActivation doesn't yet support conv_input_scale, at " +
3405         node_def.name());
3406   }
3407   if (params->validation_only) return Status::OK();
3408 
3409   // Transpose to NCHW (NCHW is required for IConvLayer).
3410   const bool need_transpose = (data_format == "NHWC");
3411   if (need_transpose) {
3412     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
3413         tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW"));
3414   }
3415 
3416   nvinfer1::DimsHW kernel_size;
3417   if (filter_format == "OIHW") {
3418     kernel_size.h() = weights.shape_.d[2];
3419     kernel_size.w() = weights.shape_.d[3];
3420   } else {
3421     // HWIO.
3422     DCHECK_EQ(filter_format, "HWIO");
3423     kernel_size.h() = weights.shape_.d[0];
3424     kernel_size.w() = weights.shape_.d[1];
3425   }
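  // Example of the two filter layouts (illustrative): an HWIO filter of
  // shape (3, 3, 8, 16) yields kernel_size 3x3 with the 16 output channels
  // read from d[3]; the same filter in OIHW is (16, 8, 3, 3), with the
  // spatial extents at d[2] and d[3].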
3426 
3427   // Add convolution.
3428   TRT_ShapedWeights biases = inputs.at(2).weights();
3429   nvinfer1::IConvolutionLayer* conv_layer = nullptr;
3430   if (filter_format == "OIHW") {
3431     // Weights are already in the right order.
3432     conv_layer = params->converter->network()->addConvolution(
3433         *tensor->trt_tensor(), weights.shape_.d[0], kernel_size,
3434         weights.GetTrtWeights(), biases.GetTrtWeights());
3435   } else {
3436     // For conv, TF weights are RSCK, and TRT expects KCRS.
3437     DCHECK_EQ(filter_format, "HWIO");
3438     TRT_ShapedWeights weights_kcrs =
3439         params->weight_store->GetTempWeights(weights);
3440     ReorderRSCKToKCRS(weights, &weights_kcrs, 1);
3441     conv_layer = params->converter->network()->addConvolution(
3442         *tensor->trt_tensor(), weights.shape_.d[3], kernel_size,
3443         weights_kcrs.GetTrtWeights(), biases.GetTrtWeights());
3444   }
3445   TFTRT_RETURN_ERROR_IF_NULLPTR(conv_layer, node_def.name());
3446   conv_layer->setStride(stride);
3447   if (attrs.get<string>("padding") == "SAME") {
3448     conv_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
3449   }
3450   params->converter->SetLayerName(conv_layer, node_def, "conv");
3451   conv_layer->setNbGroups(1);
3452   conv_layer->setDilation(dilation);
3453   ITensorProxyPtr output_tensor = conv_layer->getOutput(0);
3454 
3455   // Add activation if there is one.
3456   if (op_pair != ActivationTypeMap()->end()) {
3457     nvinfer1::IActivationLayer* activation_layer =
3458         params->converter->network()->addActivation(
3459             *output_tensor->trt_tensor(), op_pair->second);
3460     TFTRT_RETURN_ERROR_IF_NULLPTR(activation_layer, node_def.name());
3461     params->converter->SetLayerName(activation_layer, node_def, "activation");
3462     output_tensor = activation_layer->getOutput(0);
3463   }
3464   // Restore transpose.
3465   if (need_transpose) {
3466     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
3467         output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC"));
3468   }
3469   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
3470   return Status::OK();
3471 }
3472 
3473 Status ConvertPool(OpConverterParams* params) {
3474   const auto& inputs = params->inputs;
3475   const auto& node_def = params->node_def;
3476   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
3477   std::set<DataType> allowed_types{DataType::DT_FLOAT, DataType::DT_HALF,
3478                                    DataType::DT_INT8};
3479   TF_RETURN_IF_ERROR(AllowDataTypes(*params, allowed_types));
3480   nvinfer1::PoolingType type;
3481   if (node_def.op() == "MaxPool") {
3482     type = nvinfer1::PoolingType::kMAX;
3483   } else if (node_def.op() == "AvgPool") {
3484     type = nvinfer1::PoolingType::kAVERAGE;
3485   } else {
3486     return errors::Unimplemented("Unsupported pooling type: ", node_def.op(),
3487                                  ", at ", node_def.name());
3488   }
3489   TFAttrs attrs(node_def);
3490   const string padding_type = attrs.get<string>("padding");
3491   if ((padding_type != "SAME") && (padding_type != "VALID")) {
3492     return errors::Unimplemented("Unsupported padding type: ", padding_type,
3493                                  ", at ", node_def.name());
3494   }
3495   if (params->validation_only) return Status::OK();
3496 
3497   ITensorProxyPtr tensor = inputs.at(0).tensor();
3498   int h_index = 2;
3499   int w_index = 3;
3500   const auto data_format = attrs.get<string>("data_format");
3501   if (data_format == "NHWC") {
3502     h_index = 1;
3503     w_index = 2;
3504     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
3505         tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW"));
3506   }
3507 
3508   const auto tf_stride = attrs.get<std::vector<int64>>("strides");
3509   const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
3510 
3511   const auto tf_kernel = attrs.get<std::vector<int64>>("ksize");
3512   const nvinfer1::DimsHW ksize(tf_kernel[h_index], tf_kernel[w_index]);
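  // Worked example (illustrative): an NHWC MaxPool with ksize=[1, 3, 3, 1]
  // and strides=[1, 2, 2, 1] uses h_index=1 and w_index=2 to pick the
  // spatial entries, yielding ksize=(3, 3) and stride=(2, 2).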
3513 
3514   nvinfer1::IPoolingLayer* layer = params->converter->network()->addPooling(
3515       *tensor->trt_tensor(), type, ksize);
3516   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3517 
3518   layer->setStride(stride);
3519   // VALID padding is the default TRT behavior.
3520   if (attrs.get<string>("padding") == "SAME") {
3521     // SAME_UPPER means that post padding is preferred.
3522     layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
3523   }
3524   params->converter->SetLayerName(layer, node_def, "pooling");
3525   ITensorProxyPtr output_tensor = layer->getOutput(0);
3526 
3527   if (data_format == "NHWC") {
3528     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
3529         output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC"));
3530   }
3531   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
3532   return Status::OK();
3533 }
3534 
3535 Status ConvertLeakyRelu(OpConverterParams* params) {
3536   const auto& inputs = params->inputs;
3537   const auto& node_def = params->node_def;
3538   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
3539   TF_RETURN_IF_ERROR(
3540       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3541   TFAttrs attrs(node_def);
3542   const float alpha = attrs.get<float>("alpha");
3543 
3544   // Implemented with TRT's native IActivationLayer (kLEAKY_RELU).
3545   if (params->validation_only) return Status::OK();
3546 
3547   nvinfer1::IActivationLayer* layer =
3548       params->converter->network()->addActivation(
3549           *inputs.at(0).tensor()->trt_tensor(),
3550           nvinfer1::ActivationType::kLEAKY_RELU);
3551   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3552   params->converter->SetLayerName(layer, node_def, "activation");
3553   layer->setAlpha(alpha);
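  // kLEAKY_RELU computes f(x) = x for x >= 0 and f(x) = alpha * x otherwise,
  // matching TF's LeakyRelu semantics.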
3554   params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
3555   return Status::OK();
3556 }
3557 
3558 Status ConvertClipByValue(OpConverterParams* params) {
3559   const auto& inputs = params->inputs;
3560   const auto& node_def = params->node_def;
3561   // TODO(tmorris): We can also allow the case where min and max are tensors by
3562   // using elementwise min and max layers.
3563   TF_RETURN_IF_ERROR(CheckInputsWeights(
3564       *params,
3565       {{"t", false}, {"clip_value_min", true}, {"clip_value_max", true}}));
3566   TF_RETURN_IF_ERROR(
3567       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3568   if (params->validation_only) return Status::OK();
3569 
3570   TFAttrs attrs(node_def);
3571   const DataType dtype = attrs.get<DataType>("T");
3572   float clip_value_min = 0.0f;
3573   float clip_value_max = 0.0f;
3574   // TODO(tmorris): Add a templated helper function to get scalar weights of
3575   // InType casted to OutType.
3576   if (dtype == DataType::DT_FLOAT) {
3577     clip_value_min = inputs.at(1).weights().GetSpan<float>()[0];
3578     clip_value_max = inputs.at(2).weights().GetSpan<float>()[0];
3579   } else if (dtype == DataType::DT_HALF) {
3580     clip_value_min =
3581         static_cast<float>(inputs.at(1).weights().GetSpan<Eigen::half>()[0]);
3582     clip_value_max =
3583         static_cast<float>(inputs.at(2).weights().GetSpan<Eigen::half>()[0]);
3584   }
3585 
3586   nvinfer1::IActivationLayer* layer =
3587       params->converter->network()->addActivation(
3588           *inputs.at(0).tensor()->trt_tensor(),
3589           nvinfer1::ActivationType::kCLIP);
3590   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3591   layer->setAlpha(clip_value_min);
3592   layer->setBeta(clip_value_max);
3593   params->converter->SetLayerName(layer, node_def, "activation");
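  // kCLIP computes f(x) = min(max(x, alpha), beta), so alpha/beta carry
  // clip_value_min/clip_value_max here.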
3594   params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
3595   return Status::OK();
3596 }
3597 
3598 const std::unordered_map<string, nvinfer1::ActivationType>*
3599 ActivationTypeMap() {
3600   static auto* const m =
3601       new std::unordered_map<string, nvinfer1::ActivationType>({
3602         {"Relu", nvinfer1::ActivationType::kRELU},
3603             {"Sigmoid", nvinfer1::ActivationType::kSIGMOID},
3604             {"Tanh", nvinfer1::ActivationType::kTANH},
3605             {"Elu", nvinfer1::ActivationType::kELU},
3606             {"Selu", nvinfer1::ActivationType::kSELU},
3607             {"Softsign", nvinfer1::ActivationType::kSOFTSIGN},
3608             {"Softplus", nvinfer1::ActivationType::kSOFTPLUS},
3609       });
3610   return m;
3611 }
3612 
3613 Status ConvertActivation(OpConverterParams* params) {
3614   const auto& inputs = params->inputs;
3615   const auto& node_def = params->node_def;
3616   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
3617   TF_RETURN_IF_ERROR(
3618       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3619   auto op_pair = ActivationTypeMap()->find(node_def.op());
3620   if (op_pair == ActivationTypeMap()->end()) {
3621     return errors::Unimplemented("Activation op: ", node_def.op(),
3622                                  " not supported at: ", node_def.name());
3623   }
3624   if (params->validation_only) return Status::OK();
3625 
3626   // Start conversion.
3627   nvinfer1::IActivationLayer* layer =
3628       params->converter->network()->addActivation(
3629           *inputs.at(0).tensor()->trt_tensor(), op_pair->second);
3630   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3631   params->converter->SetLayerName(layer, node_def, "activation");
3632   // Set parameters.
3633   if (node_def.op() == "Elu") {
3634     layer->setAlpha(1.0f);
3635   } else if (node_def.op() == "Selu") {
3636     // From tensorflow/core/kernels/relu_op_functor.h
3637     layer->setAlpha(1.7580993408473768599402175208123f);
3638     layer->setBeta(1.0507009873554804934193349852946f);
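    // kSELU computes beta * x for x > 0 and beta * alpha * (exp(x) - 1)
    // otherwise; the constants above are TF's SELU alpha and scale.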
3639   } else if (node_def.op() == "Softplus") {
3640     layer->setAlpha(1.0f);
3641     layer->setBeta(1.0f);
3642   }
3643   params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
3644   return Status::OK();
3645 }
3646 
3647 Status ConvertQuantize(OpConverterParams* params) {
3648   const auto& inputs = params->inputs;
3649   const auto& node_def = params->node_def;
3650   if (node_def.op() == "FakeQuantWithMinMaxArgs") {
3651     TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
3652   } else if (node_def.op() == "FakeQuantWithMinMaxVars") {
3653     TF_RETURN_IF_ERROR(CheckInputsWeights(
3654         *params, {{"input", false}, {"min", true}, {"max", true}}));
3655   } else if (node_def.op() == "QuantizeAndDequantizeV2") {
3656     TF_RETURN_IF_ERROR(CheckInputsWeights(
3657         *params, {{"input", false}, {"input_min", true}, {"input_max", true}}));
3658   } else if (node_def.op() == "QuantizeAndDequantizeV3") {
3659     TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false},
3660                                                     {"input_min", true},
3661                                                     {"input_max", true},
3662                                                     {"num_bits", true}}));
3663   }
3664   float min_range = 0.0f;
3665   float max_range = 0.0f;
3666   if (node_def.op() == "FakeQuantWithMinMaxArgs") {
3667     // Get ranges via node attributes.
3668     TFAttrs attrs(node_def);
3669     if (attrs.count("min") == 0 || attrs.count("max") == 0) {
3670       return errors::InvalidArgument("Min or max attribute not found for ",
3671                                      node_def.op(), " at ", node_def.name());
3672     }
3673     min_range = attrs.get<float>("min");
3674     max_range = attrs.get<float>("max");
3675   } else if (node_def.op() == "FakeQuantWithMinMaxVars" ||
3676              node_def.op() == "QuantizeAndDequantizeV2" ||
3677              node_def.op() == "QuantizeAndDequantizeV3") {
3678     // Get ranges via inputs.
3679     auto get_weights_value = [&inputs](int index) {
3680       auto raw_weights =
3681           static_cast<float*>(inputs.at(index).weights().GetValues());
3682       return raw_weights[0];
3683     };
3684     min_range = get_weights_value(1);
3685     max_range = get_weights_value(2);
3686   } else {
3687     return errors::InvalidArgument("Unknown quantization op ", node_def.op(),
3688                                    ", at ", node_def.name());
3689   }
3690   if (params->validation_only) return Status::OK();
3691 
3692   // Store the quantization range for the tensor.
3693   ITensorProxyPtr input0 = inputs.at(0).tensor();
3694   params->converter->ProvideQuantizationRange(&input0, min_range, max_range);
3695   // Sometimes, TRT may not quantize a tensor, either because it chooses to
3696   // execute a higher precision kernel or because of op fusion. In these cases,
3697   // accuracy will suffer if the model was trained to expect quantization at
3698   // that tensor. We should consider adding a clip(tensor, min_range, max_range)
3699   // operation here to ensure that any arbitrarily placed quantize node will
3700   // execute as expected. However, this will negatively affect performance. If
3701   // users train their models in a way that mirrors inference as closely as
3702   // possible (i.e. they do not quantize at tensors where fusion will occur),
3703   // then the current implementation poses no problem.
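  // A minimal sketch of that clip operation (hypothetical, intentionally not
  // enabled here):
  //   nvinfer1::IActivationLayer* clip =
  //       params->converter->network()->addActivation(
  //           *input0->trt_tensor(), nvinfer1::ActivationType::kCLIP);
  //   clip->setAlpha(min_range);
  //   clip->setBeta(max_range);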
3704   params->outputs->push_back(inputs.at(0));
3705   return Status::OK();
3706 }
3707 
3708 Status ConvertRelu6(OpConverterParams* params) {
3709   const auto& inputs = params->inputs;
3710   const auto& node_def = params->node_def;
3711   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
3712   TF_RETURN_IF_ERROR(
3713       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3714   if (params->validation_only) return Status::OK();
3715 
3716   nvinfer1::IActivationLayer* layer =
3717       params->converter->network()->addActivation(
3718           *inputs.at(0).tensor()->trt_tensor(),
3719           nvinfer1::ActivationType::kCLIP);
3720   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3721   layer->setAlpha(0.0f);
3722   layer->setBeta(6.0f);
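  // Relu6(x) = min(max(x, 0), 6) maps directly onto kCLIP with alpha=0 and
  // beta=6.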
3723   params->converter->SetLayerName(layer, node_def, "activation");
3724   params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
3725   return Status::OK();
3726 }
3727 
3728 Status ConvertBiasAddInt8WithoutCalibration(OpConverterParams* params) {
3729   const auto& inputs = params->inputs;
3730   const auto& node_def = params->node_def;
3731   TF_RETURN_IF_ERROR(
3732       CheckInputsWeights(*params, {{"value", false}, {"bias", true}}));
3733   TF_RETURN_IF_ERROR(
3734       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3735   if (params->validation_only) return Status::OK();
3736 
3737   ITensorProxyPtr tensor = inputs.at(0).tensor();
3738   const nvinfer1::Dims original_dims = tensor->getDimensions();
3739   TFAttrs attrs(node_def);
3740   const string data_format = attrs.get<string>("data_format");
3741   const int channel_index =
3742       (data_format == "NHWC" ? original_dims.nbDims - 1 : 0);
3743 
3744   nvinfer1::Permutation permutation;
3745   if (channel_index != 0) {
3746     // Permute the dimensions so that the channel dimension is the first
3747     // dimension.
3748     for (int i = 0; i < original_dims.nbDims; ++i) {
3749       permutation.order[i] = i;
3750     }
3751     permutation.order[0] = channel_index;
3752     permutation.order[channel_index] = 0;
3753     VLOG(1) << "ConvertBiasAdd permutation: "
3754             << DebugString(permutation, original_dims.nbDims);
3755   }
3756 
3757   // TensorRT addScale requires input to be of rank 3, we need to apply
3758   // transpose as well as reshape.
3759   // TODO(laigd): this doesn't match what the TRT doc says, fix the doc?
3760   if (channel_index != 0 || original_dims.nbDims != 3) {
3761     nvinfer1::IShuffleLayer* shuffle_layer =
3762         params->converter->network()->addShuffle(*tensor->trt_tensor());
3763     TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name());
3764     params->converter->SetLayerName(shuffle_layer, node_def, "shuffle",
3765                                     /*op_instance=*/0);
3766 
3767     // NOTE(laigd): for some reason we need to apply the reshape
3768     // unconditionally. The default shape has nbDims==-1 and it seems the
3769     // behavior is undefined in some cases.
3770     nvinfer1::Dims reshape_dims;
3771     reshape_dims.nbDims = 3;
3772     // 0 means copying from input; -1 means inferring from the rest.
3773     reshape_dims.d[0] = 0;
3774     reshape_dims.d[1] = original_dims.nbDims >= 2 ? 0 : 1;
3775     reshape_dims.d[2] = original_dims.nbDims >= 3 ? -1 : 1;
3776     shuffle_layer->setReshapeDimensions(reshape_dims);
3777 
3778     if (channel_index != 0) {
3779       shuffle_layer->setFirstTranspose(permutation);
3780     }
3781     tensor = shuffle_layer->getOutput(0);
3782   }
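  // Worked example (illustrative): an explicit-batch NHWC input [N, H, W, C]
  // has channel_index == 3; the first transpose yields [C, H, W, N] and the
  // reshape [0, 0, -1] collapses it to the rank-3 shape [C, H, W*N] that
  // addScale expects.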
3783 
3784   TRT_ShapedWeights weights = inputs.at(1).weights();
3785   nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL;
3786   if (weights.shape_.d[0] == 1) {
3787     mode = nvinfer1::ScaleMode::kUNIFORM;
3788   }
3789 
3790   TRT_ShapedWeights empty_weights(weights.TrtDType());
3791   nvinfer1::IScaleLayer* layer = params->converter->network()->addScale(
3792       *tensor->trt_tensor(), mode, weights.GetTrtWeights(),
3793       empty_weights.GetTrtWeights(), empty_weights.GetTrtWeights());
3794   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3795   params->converter->SetLayerName(layer, node_def, "scale");
3796 
3797   ITensorProxyPtr output_tensor = layer->getOutput(0);
3798 
3799   // Restore transpose & reshape.
3800   if (channel_index != 0 || original_dims.nbDims != 3) {
3801     nvinfer1::IShuffleLayer* shuffle_layer =
3802         params->converter->network()->addShuffle(*output_tensor->trt_tensor());
3803     TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name());
3804     params->converter->SetLayerName(shuffle_layer, node_def, "shuffle",
3805                                     /*op_instance=*/1);
3806     // NOTE: for same reason as mentioned above we need to apply the reshape
3807     // unconditionally.
3808     nvinfer1::Dims reshape_dims = original_dims;
3809     if (channel_index != 0) {
3810       // NOTE: according to NVIDIA, dimension types are deprecated, so we don't
3811       // need to copy them back.
3812       reshape_dims.d[channel_index] = original_dims.d[0];
3813       reshape_dims.d[0] = original_dims.d[channel_index];
3814     }
3815     shuffle_layer->setReshapeDimensions(reshape_dims);
3816 
3817     if (channel_index != 0) {
3818       shuffle_layer->setSecondTranspose(permutation);
3819     }
3820     output_tensor = shuffle_layer->getOutput(0);
3821   }
3822 
3823   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
3824   return Status::OK();
3825 }
3826 
3827 Status ConvertBiasAdd(OpConverterParams* params) {
3828   if (params->precision_mode == TrtPrecisionMode::INT8 &&
3829       !params->use_calibration) {
3830     // NOTE(laigd): based on some observation, it seems TensorRT cannot fuse
3831     // IConvolutionLayer and IElementwiseLayer and will require range
3832     // information for the output of Conv2D. Using IScaleLayer will fix the
3833     // problem.
3834     return ConvertBiasAddInt8WithoutCalibration(params);
3835   }
3836   const auto& inputs = params->inputs;
3837   const auto& node_def = params->node_def;
3838 
3839   if (inputs.size() != 2) {
3840     return errors::InvalidArgument(
3841         "BiasAdd expects exactly 2 inputs, but received ", inputs.size());
3842   }
3843 
3844   if (inputs[0].is_weights() && inputs[1].is_weights()) {
3845     return errors::InvalidArgument(
3846         "All inputs are weights, but Grappler is expected to fold them.");
3847   }
3848 
3849   TF_RETURN_IF_ERROR(
3850       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
3851 
3852   TFAttrs attrs(node_def);
3853   const string& data_format = attrs.get<string>("data_format");
3854 
3855   nvinfer1::Dims input_shape = inputs.at(0).GetTrtDims();
3856   nvinfer1::Dims bias_shape = inputs.at(1).GetTrtDims();
3857   // The bias input arg is a 1-D tensor with length C. If the input is NCHW,
3858   // then we need to unsqueeze the bias such that its shape is [1, C, 1, 1].
3859   if (data_format == "NCHW") {
3860     if (params->use_implicit_batch) {
3861       // The batch dim is not included in implicit batch mode, so the shape of
3862       // the bias tensor is [C, 1, 1].
3863       bias_shape.nbDims = input_shape.nbDims;
3864       std::fill(bias_shape.d + 1, bias_shape.d + bias_shape.nbDims, 1);
3865     } else {
3866       // In explicit batch mode we create a tensor with shape [1, C, 1, 1].
3867       std::vector<int> bias_shape_vec(bias_shape.d,
3868                                       bias_shape.d + bias_shape.nbDims);
3869       // Insert a leading 1 for the batch dim.
3870       bias_shape_vec.insert(bias_shape_vec.begin(), 1);
3871       // Append trailing 1s to match the rank of input_shape.
3872       bias_shape_vec.insert(bias_shape_vec.end(),
3873                             input_shape.nbDims - bias_shape_vec.size(), 1);
3874       TF_RETURN_IF_ERROR(ContainerToTrtDims(bias_shape_vec, &bias_shape));
3875     }
3876   } else {
3877     // Next, broadcast the bias across the input.
3878     TF_RETURN_IF_ERROR(GetTrtBroadcastShape(inputs.at(0), inputs.at(1),
3879                                             /*check_feasibility=*/true,
3880                                             params->use_implicit_batch,
3881                                             &input_shape, &bias_shape));
3882   }
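  // Worked example (illustrative): in explicit-batch NCHW mode, a bias of
  // shape [C] becomes [1, C, 1, 1], which broadcasts over N, H and W in the
  // elementwise kSUM below.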
3883 
3884   // Convert input to a TRT tensor
3885   ITensorProxyPtr input_tensor{nullptr};
3886   TF_RETURN_IF_ERROR(PrepareTensorForShape(params->converter, inputs.at(0),
3887                                            input_shape, params->validation_only,
3888                                            &input_tensor, node_def,
3889                                            /*op_instance=*/0));
3890 
3891   // Finally, reshape bias. Since the bias is usually a constant, this will
3892   // normally happen at conversion-time.
3893   ITensorProxyPtr bias_tensor{nullptr};
3894   TF_RETURN_IF_ERROR(PrepareTensorForShape(params->converter, inputs.at(1),
3895                                            bias_shape, params->validation_only,
3896                                            &bias_tensor, node_def,
3897                                            /*op_instance=*/1));
3898   VLOG(2) << "Bias shape adjusted to " << DebugString(bias_shape);
3899 
3900   if (params->validation_only) return Status::OK();
3901 
3902   nvinfer1::IElementWiseLayer* layer =
3903       params->converter->network()->addElementWise(
3904           *input_tensor->trt_tensor(), *bias_tensor->trt_tensor(),
3905           nvinfer1::ElementWiseOperation::kSUM);
3906   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
3907   params->converter->SetLayerName(layer, node_def, "sum");
3908   ITensorProxyPtr output_tensor = layer->getOutput(0);
3909 
3910   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
3911   return Status::OK();
3912 }
3913 
3914 void GetTensorDimsWithProtoShape(const Tensor& tensor, nvinfer1::Dims* dims) {
3915   if (tensor.dims() > 0) {
3916     *dims = GetTrtDimsForTensor(tensor);
3917   } else {
3918     dims->nbDims = 0;  // Use scalar weights to implement scalar constants.
3919     // No dimension provided. Flatten it.
3920     dims->d[0] = tensor.NumElements();
3921     for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; ++i) {
3922       dims->d[i] = 0;
3923     }
3924   }
3925 }
3926 
3927 template <typename Input>
3928 inline bool IsIntegerInInt32Bounds(const Input& inp) {
3929   static_assert(std::is_integral<Input>::value,
3930                 "This function is only implemented for integral types.");
3931   // If Input is always within the range of int32, return true.
3932   if (sizeof(Input) < sizeof(int32) || std::is_same<Input, int32>::value) {
3933     return true;
3934   }
3935   // Otherwise, we need to check the value of the input. If the input is
3936   // unsigned, we only check the upper bound.
3937   if (!std::numeric_limits<Input>::is_signed) {
3938     return inp <= static_cast<Input>(std::numeric_limits<int32>::max());
3939   }
3940   // We can safely cast lowest() here since we now know that Input is signed and
3941   // sizeof(Input) >= sizeof(int32)
3942   return (inp >= static_cast<Input>(std::numeric_limits<int32>::lowest()) &&
3943           inp <= static_cast<Input>(std::numeric_limits<int32>::max()));
3944 }
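// For example, IsIntegerInInt32Bounds(int64{1} << 31) returns false, since
// 2^31 exceeds std::numeric_limits<int32>::max() == 2^31 - 1.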
3945 
3946 template <DataType dtype>
3947 Status CopyToTrtInt32Array(const Tensor& tensor, int32* dst) {
3948   typedef typename EnumToDataType<dtype>::Type CType;
3949   const CType* src = tensor.flat<CType>().data();
3950   for (int i = 0; i < tensor.NumElements(); ++i) {
3951     // This becomes a no-op if CType is within bounds of int32
3952     if (!IsIntegerInInt32Bounds(src[i])) {
3953       return errors::InvalidArgument("Value at index ", i,
3954                                      " is outside the range of int32");
3955     }
3956     dst[i] = static_cast<int32>(src[i]);
3957   }
3958   return Status::OK();
3959 }
3960 
3961 Status TfTensorToTrtWeights(const Tensor& tensor, TrtWeightStore* weight_store,
3962                             TRT_ShapedWeights* weights) {
3963   const DataType dtype = tensor.dtype();
3964 
3965   // We always convert the integer constants to INT32.
3966   //
3967   // TODO(aaroey): FP16 will remain in half format and is not converted to
3968   // FP32, but the converter currently uses all float weights as FP32. Fix
3969   // this.
3970   DataType converted_dtype = DataTypeIsInteger(dtype) ? DT_INT32 : dtype;
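  // For example, a DT_INT64 constant is narrowed to DT_INT32 below, with a
  // per-element range check in CopyToTrtInt32Array.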
3971 
3972   // Verify that the dtype is supported by TensorRT. Otherwise, return an error.
3973   nvinfer1::DataType trt_dtype;
3974   TF_RETURN_IF_ERROR(TfTypeToTrtType(converted_dtype, &trt_dtype));
3975 
3976   if (tensor.NumElements() == 0) {
3977     // Return empty weights.
3978     *weights = TRT_ShapedWeights(trt_dtype);
3979     return Status::OK();
3980   }
3981 
3982   nvinfer1::Dims weight_dims;
3983   GetTensorDimsWithProtoShape(tensor, &weight_dims);
3984   *weights = weight_store->GetTempWeights(trt_dtype, weight_dims);
3985 
3986   // Copy the tensor directly if the tensor does not require cast to the
3987   // supported type.
3988   if (converted_dtype == dtype) {
3989     char* dst = static_cast<char*>(weights->GetValues());
3990     memcpy(dst, tensor.tensor_data().data(), tensor.TotalBytes());
3991     return Status::OK();
3992   }
3993 
3994   Status status = Status::OK();
3995   // Copy tensor elements after casting them to the converted DataType.
3996   int32* dst = static_cast<int32*>(weights->GetValues());
3997   switch (dtype) {
3998     case DT_INT8:
3999       status = CopyToTrtInt32Array<DT_INT8>(tensor, dst);
4000       break;
4001     case DT_UINT8:
4002       status = CopyToTrtInt32Array<DT_UINT8>(tensor, dst);
4003       break;
4004     case DT_INT16:
4005       status = CopyToTrtInt32Array<DT_INT16>(tensor, dst);
4006       break;
4007     case DT_UINT16:
4008       status = CopyToTrtInt32Array<DT_UINT16>(tensor, dst);
4009       break;
4010     case DT_UINT32:
4011       status = CopyToTrtInt32Array<DT_UINT32>(tensor, dst);
4012       break;
4013     case DT_INT64:
4014       status = CopyToTrtInt32Array<DT_INT64>(tensor, dst);
4015       break;
4016     case DT_UINT64:
4017       status = CopyToTrtInt32Array<DT_UINT64>(tensor, dst);
4018       break;
4019     default:
4020       return errors::Internal("Unexpected DataType: ", DataTypeString(dtype));
4021   }
4022   return status;
4023 }
4024 
4025 // Convert a Const NodeDef to TRT_ShapedWeights. This is a special converter: it
4026 // always ignores the params->validation_only parameter but adds the converted
4027 // weights to params->outputs. We do this because TrtNodeValidator needs the
4028 // weights as inputs to other nodes, and uses them to determine whether those
4029 // nodes are supported by TRT.
4030 Status ConvertConst(OpConverterParams* params) {
4031   const auto& inputs = params->inputs;
4032   const auto& node_def = params->node_def;
4033   if (!inputs.empty()) {
4034     return errors::InvalidArgument(
4035         "Constant node is expected to have empty input list: ",
4036         node_def.name());
4037   }
4038 
4039   // Create shaped weights as output
4040   const auto& tensor_proto = node_def.attr().at("value").tensor();
4041   Tensor tensor;
4042   if (!tensor.FromProto(tensor_proto)) {
4043     return errors::Internal("Cannot parse weight tensor proto: ",
4044                             node_def.name());
4045   }
4046 
4047   TFAttrs attrs(node_def);
4048   const DataType dtype = attrs.get<DataType>("dtype");
4049   if (dtype != tensor.dtype()) {
4050     return errors::InvalidArgument("DataType mismatch between attr (",
4051                                    DataTypeString(dtype), ") and tensor (",
4052                                    DataTypeString(tensor.dtype()), ")");
4053   }
4054 
4055   TRT_ShapedWeights weights;
4056   TF_RETURN_IF_ERROR(
4057       TfTensorToTrtWeights(tensor, params->weight_store, &weights));
4058 
4059   if (params->outputs != nullptr) {
4060     params->outputs->push_back(TRT_TensorOrWeights(weights));
4061   }
4062   return Status::OK();
4063 }
4064 
4065 Status ConvertIdentity(OpConverterParams* params) {
4066   // TODO(tmorris): TRT's Identity layer does not get optimized away as of TRT
4067   // 5.0; once we know that it does, it would be nice to use that layer
4068   // instead.
4069   if (params->validation_only) return Status::OK();
4070   params->outputs->push_back(params->inputs.at(0));
4071   return Status::OK();
4072 }
4073 
4074 const std::unordered_map<string, nvinfer1::ElementWiseOperation>*
4075 BinaryOperationMap() {
4076   static auto* const m =
4077       new std::unordered_map<string, nvinfer1::ElementWiseOperation> {
4078     {"Add", nvinfer1::ElementWiseOperation::kSUM},
4079         {"AddV2", nvinfer1::ElementWiseOperation::kSUM},
4080         {"Mul", nvinfer1::ElementWiseOperation::kPROD},
4081         {"Sub", nvinfer1::ElementWiseOperation::kSUB},
4082         {"Div", nvinfer1::ElementWiseOperation::kDIV},
4083         {"FloorDiv", nvinfer1::ElementWiseOperation::kFLOOR_DIV},
4084         {"RealDiv", nvinfer1::ElementWiseOperation::kDIV},
4085         {"Minimum", nvinfer1::ElementWiseOperation::kMIN},
4086         {"Maximum", nvinfer1::ElementWiseOperation::kMAX},
4087         {"Pow", nvinfer1::ElementWiseOperation::kPOW},
4088   };
4089   return m;
4090 }
4091 
4092 Status ConvertBinary(OpConverterParams* params) {
4093   const auto& inputs = params->inputs;
4094   const auto& node_def = params->node_def;
4095   if (inputs.size() != 2) {
4096     return errors::InvalidArgument(node_def.op(), " got ", inputs.size(),
4097                                    " inputs but expected 2, at ",
4098                                    node_def.name());
4099   }
4100   std::set<DataType> allowed_types{DataType::DT_FLOAT, DataType::DT_HALF,
4101                                    DataType::DT_INT32};
4102   TF_RETURN_IF_ERROR(AllowDataTypes(*params, allowed_types));
4103 
4104   // Constant folding should have been done by TensorFlow
4105   if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) {
4106     return errors::Unimplemented(
4107         "Constant folding is falled back to TensorFlow, binary op received "
4108         "both input as constant at: ",
4109         node_def.name());
4110   }
4111   const TRT_TensorOrWeights& operand_l = inputs.at(0);
4112   const TRT_TensorOrWeights& operand_r = inputs.at(1);
4113 
4114   auto op_pair = BinaryOperationMap()->find(node_def.op());
4115   if (op_pair == BinaryOperationMap()->end()) {
4116     return errors::Unimplemented("Binary op ", node_def.op(),
4117                                  " not supported at: ", node_def.name());
4118   }
4119 
4120   nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r;
4121   TF_RETURN_IF_ERROR(GetTrtBroadcastShape(
4122       operand_l, operand_r, /*check_feasibility=*/true,
4123       params->use_implicit_batch, &broadcasted_dims_l, &broadcasted_dims_r));
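  // Worked example (illustrative): operands with dims [2, 3] and [3]
  // broadcast to [2, 3] and [1, 3]; PrepareTensorForShape below reshapes each
  // operand to its broadcasted dims.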
4124   ITensorProxyPtr tensor_l = nullptr;
4125   ITensorProxyPtr tensor_r = nullptr;
4126   // This will also convert constants to tensors.
4127   TF_RETURN_IF_ERROR(PrepareTensorForShape(
4128       params->converter, operand_l, broadcasted_dims_l, params->validation_only,
4129       &tensor_l, node_def, /*op_instance=*/0));
4130   TF_RETURN_IF_ERROR(PrepareTensorForShape(
4131       params->converter, operand_r, broadcasted_dims_r, params->validation_only,
4132       &tensor_r, node_def, /*op_instance=*/1));
4133   if (params->validation_only) return Status::OK();
4134 
4135   // Add ElementWise layer.
4136   nvinfer1::ILayer* layer = params->converter->network()->addElementWise(
4137       *tensor_l->trt_tensor(), *tensor_r->trt_tensor(), op_pair->second);
4138   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
4139   params->converter->SetLayerName(layer, node_def);
4140   ITensorProxyPtr trt_tensor = layer->getOutput(0);
4141 
4142   params->outputs->push_back(TRT_TensorOrWeights(trt_tensor));
4143   return Status::OK();
4144 }
4145 
4146 Status ConvertRsqrt(OpConverterParams* params) {
4147   const auto& inputs = params->inputs;
4148   const auto& node_def = params->node_def;
4149   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}}));
4150   TF_RETURN_IF_ERROR(
4151       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
4152   if (params->validation_only) return Status::OK();
4153 
4154   // Start conversion.
4155   ITensorProxyPtr tensor = inputs.at(0).tensor();
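  // Rsqrt(x) = 1 / sqrt(x), composed here from TRT's kSQRT and kRECIP
  // unary operations.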
4156   // Sqrt
4157   nvinfer1::IUnaryLayer* sqrt_layer = params->converter->network()->addUnary(
4158       *tensor->trt_tensor(), nvinfer1::UnaryOperation::kSQRT);
4159   TFTRT_RETURN_ERROR_IF_NULLPTR(sqrt_layer, node_def.name());
4160   params->converter->SetLayerName(sqrt_layer, node_def, "sqrt");
4161   // Recip
4162   nvinfer1::IUnaryLayer* recip_layer = params->converter->network()->addUnary(
4163       *sqrt_layer->getOutput(0), nvinfer1::UnaryOperation::kRECIP);
4164   TFTRT_RETURN_ERROR_IF_NULLPTR(recip_layer, node_def.name());
4165   params->converter->SetLayerName(recip_layer, node_def, "recip");
4166   params->outputs->push_back(TRT_TensorOrWeights(recip_layer->getOutput(0)));
4167   return Status::OK();
4168 }
4169 
4170 const std::unordered_map<string, nvinfer1::UnaryOperation>*
4171 UnaryOperationMap() {
4172   static auto* const m =
4173       new std::unordered_map<string, nvinfer1::UnaryOperation>({
4174         {"Neg", nvinfer1::UnaryOperation::kNEG},
4175             {"Exp", nvinfer1::UnaryOperation::kEXP},
4176             {"Log", nvinfer1::UnaryOperation::kLOG},
4177             {"Sqrt", nvinfer1::UnaryOperation::kSQRT},
4178             {"Abs", nvinfer1::UnaryOperation::kABS},
4179             {"Reciprocal", nvinfer1::UnaryOperation::kRECIP},
4180             {"Sin", nvinfer1::UnaryOperation::kSIN},
4181             {"Cos", nvinfer1::UnaryOperation::kCOS},
4182             {"Tan", nvinfer1::UnaryOperation::kTAN},
4183             {"Sinh", nvinfer1::UnaryOperation::kSINH},
4184             {"Cosh", nvinfer1::UnaryOperation::kCOSH},
4185             {"Asin", nvinfer1::UnaryOperation::kASIN},
4186             {"Acos", nvinfer1::UnaryOperation::kACOS},
4187             {"Atan", nvinfer1::UnaryOperation::kATAN},
4188             {"Asinh", nvinfer1::UnaryOperation::kASINH},
4189             {"Acosh", nvinfer1::UnaryOperation::kACOSH},
4190             {"Atanh", nvinfer1::UnaryOperation::kATANH},
4191             {"Ceil", nvinfer1::UnaryOperation::kCEIL},
4192             {"Floor", nvinfer1::UnaryOperation::kFLOOR},
4193             {"Erf", nvinfer1::UnaryOperation::kERF},
4194       });
4195   return m;
4196 }
4197 
4198 Status ConvertUnary(OpConverterParams* params) {
4199   const auto& inputs = params->inputs;
4200   const auto& node_def = params->node_def;
4201   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}}));
4202   TF_RETURN_IF_ERROR(
4203       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
4204   auto op_pair = UnaryOperationMap()->find(node_def.op());
4205   if (op_pair == UnaryOperationMap()->end()) {
4206     return errors::Unimplemented("Unary op: ", node_def.op(),
4207                                  " not supported at: ", node_def.name());
4208   }
4209   if (params->validation_only) return Status::OK();
4210 
4211   // Start conversion.
4212   ITensorProxyPtr tensor = inputs.at(0).tensor();
4213   nvinfer1::IUnaryLayer* layer = params->converter->network()->addUnary(
4214       *tensor->trt_tensor(), op_pair->second);
4215   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
4216   params->converter->SetLayerName(layer, node_def);
4217   ITensorProxyPtr output_tensor = layer->getOutput(0);
4218 
4219   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
4220   return Status::OK();
4221 }
4222 
4223 Status ConvertSquare(OpConverterParams* params) {
4224   const auto& inputs = params->inputs;
4225   const auto& node_def = params->node_def;
4226   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}}));
4227   TF_RETURN_IF_ERROR(AllowDataTypes(
4228       *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
4229   if (params->validation_only) return Status::OK();
4230 
4231   // Constant 2 with same rank as input
4232   ITensorProxyPtr const2_tensor = nullptr;
4233   TF_RETURN_IF_ERROR(CreateBroadcastableScalarConstant(
4234       params, 2.0f, inputs.at(0).GetTrtDims(), &const2_tensor));
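  // Square(x) is computed as pow(x, 2): the scalar 2 is materialized as a
  // constant of the same rank as the input so that kPOW can broadcast it.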
4235 
4236   // ElementWise Pow Operation
4237   nvinfer1::IElementWiseLayer* layer =
4238       params->converter->network()->addElementWise(
4239           *inputs.at(0).tensor()->trt_tensor(), *const2_tensor->trt_tensor(),
4240           nvinfer1::ElementWiseOperation::kPOW);
4241   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
4242   params->converter->SetLayerName(layer, node_def);
4243   ITensorProxyPtr output_tensor = layer->getOutput(0);
4244 
4245   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
4246   return Status::OK();
4247 }
4248 
4249 Status ConvertReduce(OpConverterParams* params) {
4250   const auto& inputs = params->inputs;
4251   const auto& node_def = params->node_def;
4252   TF_RETURN_IF_ERROR(
4253       CheckInputsWeights(*params, {{"input", false}, {"axis", true}}));
4254   TF_RETURN_IF_ERROR(
4255       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
4256 
4257   ITensorProxyPtr tensor = inputs.at(0).tensor();
4258   auto tf_axes_list = inputs.at(1).weights().GetSpan<int>();
4259 
4260   TFAttrs attrs(node_def);
4261   // Only INT32 axis attributes are expected for now.
4262   if (attrs.get<DataType>("Tidx") != DataType::DT_INT32) {
4263     return errors::Unimplemented("Tidx supports only DT_INT32");
4264   }
4265 
4266   int axes = 0;
4267   if (tf_axes_list.size() == 0) {
4268     return errors::InvalidArgument(
4269         "TRT cannot support reduce on all (batch) dimensions, at",
4270         node_def.name());
4271   }
4272   for (int i = 0; i < tf_axes_list.size(); i++) {
4273     int trt_axis;
4274     TF_RETURN_IF_ERROR(
4275         ConvertAxis(tf_axes_list[i], tensor->getDimensions().nbDims,
4276                     node_def.name(), params->use_implicit_batch, &trt_axis));
4277     axes |= (1 << trt_axis);
4278   }
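  // Worked example (illustrative): reducing over trt axes 1 and 2 yields the
  // bitmask axes = (1 << 1) | (1 << 2) = 6, the format addReduce expects.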
4279 
4280   nvinfer1::ReduceOperation reduce_operation;
4281   if (node_def.op() == "Sum") {
4282     reduce_operation = nvinfer1::ReduceOperation::kSUM;
4283   } else if (node_def.op() == "Prod") {
4284     reduce_operation = nvinfer1::ReduceOperation::kPROD;
4285   } else if (node_def.op() == "Max") {
4286     reduce_operation = nvinfer1::ReduceOperation::kMAX;
4287   } else if (node_def.op() == "Min") {
4288     reduce_operation = nvinfer1::ReduceOperation::kMIN;
4289   } else if (node_def.op() == "Mean") {
4290     reduce_operation = nvinfer1::ReduceOperation::kAVG;
4291   } else {
4292     return errors::Unimplemented("Op not supported ", node_def.op(), ", at ",
4293                                  node_def.name());
4294   }
4295   if (params->validation_only) return Status::OK();
4296 
4297   const auto keep_dims = attrs.get<bool>("keep_dims");
4298   nvinfer1::ILayer* layer = params->converter->network()->addReduce(
4299       *tensor->trt_tensor(), reduce_operation, axes, keep_dims);
4300   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
4301   params->converter->SetLayerName(layer, node_def);
4302 
4303   params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
4304   return Status::OK();
4305 }
4306 
4307 // TensorRT does not support the Pack op natively. Therefore, Pack op is
4308 // converted by first expanding input tensors by adding a new dimension of size
4309 // one at the specified axis and then concatenating the tensors at the same
4310 // axis.
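// Worked example (illustrative): packing two [2, 3] tensors along axis 1
// expands each to [2, 1, 3] and concatenates them into a [2, 2, 3] result.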
4311 Status ConvertPack(OpConverterParams* params) {
4312   const auto& inputs = params->inputs;
4313   const auto& node_def = params->node_def;
4314 
4315   TFAttrs attrs(node_def);
4316   const int num_inputs = attrs.get<int64>("N");
4317   if (num_inputs != inputs.size()) {
4318     return errors::InvalidArgument(
4319         "Number of inputs for Pack is inconsistent with N attribute, at ",
4320         node_def.name());
4321   }
4322 
4323   // In implicit batch mode we do not allow weight input. An input tensor with
4324   // dims NCHW is represented with dims CHW during conversion time, and N is
4325   // defined only during runtime. A weight is represented with dims NCHW. We
4326   // cannot be sure that the runtime N will agree with the conversion time N,
4327   // therefore we do not convert the pack op if it has both tensor and weight
4328   // inputs. This restriction does not apply in explicit batch mode, in that
4329   // case the input tensors are also represented with full dims that include the
4330   // batch size.
4331   TrtInputArg expected_arg =
4332       params->use_implicit_batch ? TrtInputArg::kTensor : TrtInputArg::kBoth;
4333 
4334   std::vector<std::pair<string, TrtInputArg>> inputs_is_weight;
4335   inputs_is_weight.reserve(num_inputs);
4336   for (int i = 0; i < num_inputs; ++i) {
4337     inputs_is_weight.push_back({StrCat("values_", i), expected_arg});
4338   }
4339   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, inputs_is_weight));
4340 
4341   std::set<DataType> allowed_types{DataType::DT_FLOAT, DataType::DT_HALF,
4342                                    DataType::DT_INT32};
4343   TF_RETURN_IF_ERROR(AllowDataTypes(*params, allowed_types));
4344   if (num_inputs > 1) {
4345     // Verify that inputs are compatible for concatenation after the expansion.
4346     TF_RETURN_IF_ERROR(
4347         VerifyShapesMatch(inputs, /*masked_dim=*/-1, node_def.name()));
4348   }
4349 
4350   // Find the dimensions of the inputs. In general inputs can have dynamic
4351   // shapes, in which case we have to use DynamicExpandDims to calculate the
4352   // expanded dimensions. To avoid that, we try to find an input that is
4353   // guaranteed to have a known static shape (e.g. a weight input).
4354   int idx = 0;
4355   for (int i = 1; i < inputs.size(); i++) {
4356     if (HasStaticShape(inputs.at(i).GetTrtDims())) {
4357       idx = i;
4358     }
4359   }
4360   const nvinfer1::Dims dims = inputs.at(idx).GetTrtDims();
4361   // Convert axis from the TensorFlow format to TensorRT format.
4362   const int64 tf_axis = attrs.get<int64>("axis");
4363   int trt_axis;
4364   TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims + 1, node_def.name(),
4365                                  params->use_implicit_batch, &trt_axis));
4366 
4367   // Compute expanded dimensions and then reshape input tensors.
4368   std::vector<int> tensor_dims(dims.d, dims.d + dims.nbDims);
4369   tensor_dims.insert(tensor_dims.begin() + trt_axis, 1);
4370   nvinfer1::Dims expanded_dims;
4371   TF_RETURN_IF_ERROR(ContainerToTrtDims(tensor_dims, &expanded_dims));
4372   std::vector<ITensorProxyPtr> expanded_tensors;
4373   int input_index = 0;
4374   for (const TRT_TensorOrWeights& input : inputs) {
4375     ITensorProxyPtr expanded_tensor = nullptr;
4376     if (input.is_tensor() && !params->use_implicit_batch &&
4377         !HasStaticShape(dims)) {
4378       if (!params->validation_only) {
4379         TF_RETURN_IF_ERROR(params->converter->DynamicExpandDims(
4380             input.tensor(), dims, trt_axis, params, &expanded_tensor,
4381             input_index));
4382       }
4383     } else {
4384       TF_RETURN_IF_ERROR(PrepareTensorForShape(
4385           params->converter, input, expanded_dims, params->validation_only,
4386           &expanded_tensor, node_def, input_index));
4387     }
4388     if (!params->validation_only) {
4389       expanded_tensors.push_back(expanded_tensor);
4390     }
4391     input_index++;
4392   }
4393   if (params->validation_only) return Status::OK();
4394 
4395   // If there is only one tensor in the input, return the expanded tensor.
4396   if (num_inputs == 1) {
4397     params->outputs->push_back(TRT_TensorOrWeights(expanded_tensors[0]));
4398     return Status::OK();
4399   }
4400 
4401   // Otherwise, concatenate expanded tensors.
4402   std::vector<nvinfer1::ITensor*> trt_expanded_tensors;
4403   for (const auto& t : expanded_tensors) {
4404     trt_expanded_tensors.push_back(t->trt_tensor());
4405   }
4406   nvinfer1::IConcatenationLayer* layer =
4407       params->converter->network()->addConcatenation(
4408           static_cast<nvinfer1::ITensor* const*>(trt_expanded_tensors.data()),
4409           expanded_tensors.size());
4410   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
4411   params->converter->SetLayerName(layer, node_def, "concat");
4412   // Note that trt_axis stays the same even after expanding tensors at the axis.
4413   layer->setAxis(trt_axis);
4414   params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
4415   return Status::OK();
4416 }
4417 
4418 Status ConvertPad(OpConverterParams* params) {
4419   const auto& inputs = params->inputs;
4420   const auto& node_def = params->node_def;
4421   TF_RETURN_IF_ERROR(
4422       CheckInputsWeights(*params, {{"tensor", false}, {"paddings", true}}));
4423   TF_RETURN_IF_ERROR(AllowDataTypes(
4424       *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT8}));
4425 
4426   // Get the input tensor and its dimensions.
4427   ITensorProxyPtr tensor = inputs.at(0).tensor();
4428   const auto dims = tensor->getDimensions();
4429   // Restore implicit batch dimension
4430   const int nb_dims =
4431       params->use_implicit_batch ? dims.nbDims + 1 : dims.nbDims;
4432 
4433   // TODO(tfeher): Support nb_dims < 4 by inserting extra dimensions to the
4434   // original input.
4435   if (nb_dims < 4) {
4436     return errors::InvalidArgument("ConvertPad requires at least 4D input, at ",
4437                                    node_def.name());
4438   }
4439   TRT_ShapedWeights pads = inputs.at(1).weights();
4440 
4441   TFAttrs attrs(node_def);
4442   // The padding type is read as a TF DataType so that EnumToDataType can be
4443   // leveraged for the cast below.
4444   auto padding_type = attrs.get<DataType>("Tpaddings");
4445   // TODO(jie): handle data type conversion for TRT?
4446 
4447   if (pads.shape_.d[0] != nb_dims || pads.shape_.d[1] != 2) {
4448     return errors::InvalidArgument("Paddings at ", node_def.name(),
4449                                    " must be a weight with shape [n, 2], "
4450                                    "where n is the rank of input tensor");
4451   }
4452 
4453   // Only INT32 padding values are expected for now.
4454   if (padding_type != DataType::DT_INT32) {
4455     return errors::Unimplemented("Tpaddings supports only DT_INT32");
4456   }
4457   auto pad_data = static_cast<int*>(pads.GetValues());
4458 
4459   std::vector<int32_t> tf_pad_index;
4460   for (int i = 0; i < nb_dims; i++) {
4461     if (pad_data[2 * i] != 0 || pad_data[2 * i + 1] != 0) {
4462       tf_pad_index.push_back(i);
4463     }
4464   }
4465 
4466   // No padding at all, we should exit
4467   if (tf_pad_index.empty()) {
4468     params->outputs->push_back(inputs.at(0));
4469     return Status::OK();
4470   }
4471 
4472   // TRT pad layer can only support padding on up to 2 dimensions (TRT-2579).
4473   // TODO(tfeher): Use multiple TRT pad layers to support padding on more than 2
4474   // dimensions.
4475   if (tf_pad_index.size() > 2) {
4476     return errors::InvalidArgument(
4477         "Padding layer does not support padding on > 2");
4478   }
4479 
4480   // Padding on batch dimension is not supported
4481   if (params->use_implicit_batch && tf_pad_index[0] == 0) {
4482     return errors::InvalidArgument(
4483         "Padding layer does not support padding on batch dimension");
4484   }
4485 
4486   if (params->validation_only) return Status::OK();
4487 
4488   // TRT can only do the padding at the last two dimensions. We transpose the
4489   // input tensor if needed.
4490   bool transposed_pad = false;
4491   std::vector<int> transpose_idx(nb_dims);
4492   std::iota(transpose_idx.begin(), transpose_idx.end(), 0);
4493 
4494   // trt_pad_index denotes the actual idx where the padding is performed by TRT.
4495   std::vector<int> trt_pad_index{nb_dims - 2, nb_dims - 1};
4496 
4497   // How many zeros are padded at the last two dimensions.
4498   nvinfer1::DimsHW pre_padding(0, 0);
4499   nvinfer1::DimsHW post_padding(0, 0);
4500 
4501   // Dimension to set in the pre_padding and post_padding array.
4502   std::vector<int> trt_pre_post_padding_index{0, 1};
4503 
4504   // Two special cases where we can avoid permutations.
4505   if (tf_pad_index.size() == 1 && tf_pad_index[0] == nb_dims - 1) {
4506     // Only one dimension needs to be padded. We store its index at
4507     // trt_pad_index[0]. We ignore trt_pad_index[1].
4508     trt_pad_index[0] = nb_dims - 1;
4509     trt_pre_post_padding_index[0] = 1;
4510   }
4511   if (tf_pad_index.size() == 2 && tf_pad_index[1] == nb_dims - 2) {
4512     // tf_pad_index only has two values that are in ascending order. If
4513     // tf_pad_index[1] is nb_dims-2, then swapping the two values in
4514     // trt_pad_index here makes it possible to only swap one pair of dimensions
4515     // (swap tf_pad_index[0] with nb_dims-1) in the input tensor. Otherwise, we
4516     // would have to swap two pairs of dimensions in the input tensor:
4517     // (tf_pad_index[0] with nb_dims-2) and (tf_pad_index[1], with nb_dims-1).
4518     // Here is an example for a 4D input tensor:
4519     // tf_pad_index = [1, 2]
4520     // trt_pad_index = [3, 2]
4521     // transpose_idx = [0, 3, 2, 1]
4522     std::swap(trt_pad_index[0], trt_pad_index[1]);
4523     std::swap(trt_pre_post_padding_index[0], trt_pre_post_padding_index[1]);
4524   }
4525 
4526   for (int i = 0; i < tf_pad_index.size(); i++) {
4527     const int tf_index = tf_pad_index[i];
4528     const int trt_index = trt_pad_index[i];
4529     const int k = trt_pre_post_padding_index[i];
4530     pre_padding.d[k] = pad_data[tf_index * 2];
4531     post_padding.d[k] = pad_data[tf_index * 2 + 1];
4532     if (tf_index != trt_index) {
4533       transposed_pad = true;
4534       std::swap(transpose_idx[tf_index], transpose_idx[trt_index]);
4535     }
4536   }
4537 
4538   if (transposed_pad) {
4539     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
4540         tensor, transpose_idx, &tensor, node_def, "to_pad"));
4541   }
4542 
4543   nvinfer1::IPaddingLayer* layer = params->converter->network()->addPadding(
4544       *tensor->trt_tensor(), pre_padding, post_padding);
4545   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
4546   params->converter->SetLayerName(layer, node_def);
4547   ITensorProxyPtr output_tensor = layer->getOutput(0);
4548 
4549   if (transposed_pad) {
4550     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
4551         output_tensor, transpose_idx, &output_tensor, node_def, "from_pad"));
4552   }
4553 
4554   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
4555   return Status::OK();
4556 }
4557 
4558 Status ConvertSplitHelper(OpConverterParams* params,
4559                           const TRT_TensorOrWeights& input, int tf_axis,
4560                           int num_splits, bool squeeze_after) {
4561   const auto& node_def = params->node_def;
4562   const nvinfer1::Dims dims = input.GetTrtDims();
4563   // Convert axis.
4564   int trt_axis;
4565   TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(),
4566                                  params->use_implicit_batch, &trt_axis));
4567 
4568   if (dims.d[trt_axis] < 0) {
4569     return errors::InvalidArgument(
4570         "Dimension ", tf_axis, " must have statically defined dimensions, at ",
4571         node_def.name());
4572   }
4573 
4574   // Dimension must equal num_splits for Unstack (when squeeze_after is true)
4575   if (squeeze_after && dims.d[trt_axis] != num_splits) {
4576     return errors::InvalidArgument(
4577         "Dimension ", tf_axis, " has size ", dims.d[trt_axis],
4578         " which is not equal to num of ", num_splits, ", at ", node_def.name());
4579   }
4580   // Dimension must be evenly divisible by num_splits.
4581   if (dims.d[trt_axis] % num_splits != 0) {
4582     return errors::InvalidArgument(
4583         "Dimension ", tf_axis, " of size ", dims.d[trt_axis],
4584         " is not evenly divisible by ", num_splits, ", at ", node_def.name());
4585   }
4586 
4587   // Create parameters for StridedSliceHelper.
4588   // Slice will begin on zero for all dims, except the one being split which
4589   // will change.
4590   std::vector<int> begin(dims.nbDims, 0);
4591   // Determine size of split. Slice will get the full length of all dims, except
4592   // the one being split. Undefined dims (-1) will translate to a size of -1
4593   // which will tell StridedSlice to take full length of that dim.
4594   std::vector<int> size(dims.d, dims.d + dims.nbDims);
4595   const int split_size_on_axis = dims.d[trt_axis] / num_splits;
4596   size[trt_axis] = split_size_on_axis;
4597   // Stride will always be 1
4598   std::vector<int> stride(dims.nbDims, 1);
4599   // Add dummy batch dimension
4600   if (params->use_implicit_batch) {
4601     begin.insert(begin.begin(), 0);
4602     size.insert(size.begin(), 1);
4603     stride.insert(stride.begin(), 1);
4604   }
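  // Worked example (illustrative, explicit-batch case): splitting a [6, 4]
  // tensor into 3 pieces along axis 0 issues slices with
  // begin = {0, 0}, {2, 0}, {4, 0}, size = {2, 4}, and stride = {1, 1}.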
4605   // Create final shape for Unpack/Unstack, where split axis is squeezed.
4606   nvinfer1::Dims final_shape_for_unpack;
4607   nvinfer1::Dims* final_shape_for_unpack_ptr = nullptr;
4608 
4609   // We can't use final_shape_for_unpack_ptr when input dimensions are not
4610   // fully defined.
4611   const bool is_dynamic_shape = !HasStaticShape(dims);
4612   if (squeeze_after && !is_dynamic_shape) {
4613     std::vector<int> size_after_squeeze(size);
4614     const int tf_axis = trt_axis + (params->use_implicit_batch ? 1 : 0);
4615     size_after_squeeze.erase(size_after_squeeze.begin() + tf_axis);
4616     TF_RETURN_IF_ERROR(ContainerToTrtDims(size_after_squeeze,
4617                                           &final_shape_for_unpack,
4618                                           /*ignore_first_dim=*/
4619                                           params->use_implicit_batch));
4620     final_shape_for_unpack_ptr = &final_shape_for_unpack;
4621   }
4622 
4623   // Slice the input. ConvertStridedSliceHelper will push the outputs onto
4624   // params->outputs.
4625   for (int i = 0; i < num_splits; ++i) {
4626     const int tf_axis = trt_axis + (params->use_implicit_batch ? 1 : 0);
4627     begin[tf_axis] = i * split_size_on_axis;
4628     TF_RETURN_IF_ERROR(ConvertStridedSliceHelper(
4629         params, input, begin, size, stride, final_shape_for_unpack_ptr,
4630         /*op_instance=*/i));
4631   }
4632   if (params->validation_only) return Status::OK();
4633 
4634   // Squeeze for dynamic shapes
4635   if (squeeze_after && is_dynamic_shape) {
4636     for (int i = 0; i < params->outputs->size(); i++) {
4637       ITensorProxyPtr output_tensor = nullptr;
4638       std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
4639       input_dims[trt_axis] = 0;
4640       TF_RETURN_IF_ERROR(params->converter->SqueezeTensor(
4641           params->outputs->at(i).tensor(), &input_dims, params,
4642           &output_tensor));
4643       (*params->outputs)[i] = TRT_TensorOrWeights(output_tensor);
4644     }
4645   }
4646   return Status::OK();
4647 }
4648 
4649 Status ConvertSplit(OpConverterParams* params) {
4650   const auto& inputs = params->inputs;
4651   const auto& node_def = params->node_def;
4652   TF_RETURN_IF_ERROR(
4653       CheckInputsWeights(*params, {{"axis", true}, {"value", false}}));
4654   TF_RETURN_IF_ERROR(AllowDataTypes(*params, {
4655                                                  DataType::DT_FLOAT,
4656                                                  DataType::DT_HALF,
4657                                                  DataType::DT_INT32,
4658                                              }));
4659   int tf_axis = inputs.at(0).weights().GetSpan<int>()[0];
4660   TFAttrs attrs(node_def);
4661   const int num_split = attrs.get<int64>("num_split");
4662 
4663   return ConvertSplitHelper(params, inputs.at(1), tf_axis, num_split, false);
4664 }
4665 
4666 Status ConvertUnpack(OpConverterParams* params) {
4667   const auto& inputs = params->inputs;
4668   const auto& node_def = params->node_def;
4669   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"value", false}}));
4670   TF_RETURN_IF_ERROR(AllowDataTypes(*params, {
4671                                                  DataType::DT_FLOAT,
4672                                                  DataType::DT_HALF,
4673                                                  DataType::DT_INT32,
4674                                              }));
4675   // Input must be at least rank 1 excluding the batch dim; we can't unpack scalars.
4676   if (inputs.at(0).GetTrtDims().nbDims == 0) {
4677     return errors::Unimplemented(
4678         "Input \"value\" for Unpack must be rank 2 or greater, at ",
4679         node_def.name());
4680   }
4681   TFAttrs attrs(node_def);
4682   const int tf_axis = attrs.get<int64>("axis");
4683   const int num = attrs.get<int64>("num");
4684 
4685   return ConvertSplitHelper(params, inputs.at(0), tf_axis, num, true);
4686 }
4687 
4688 // Supports cast fp16=>fp32 through IIdentityLayer.
4689 Status ConvertCast(OpConverterParams* params) {
4690   const NodeDef& node_def = params->node_def;
4691   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}}));
4692   auto unsupport_cast_error = [&]() {
4693     return errors::Unimplemented("Cast op: ", node_def.op(),
4694                                  " not supported at: ", node_def.name());
4695   };
4696 
4697   DataType input_type;
4698   TF_RETURN_IF_ERROR(GetInputTfType(*params, &input_type, 0));
4699   if (input_type != DataType::DT_HALF) {
4700     return unsupport_cast_error();
4701   }
4702 
4703   DataType output_type;
4704   TF_RETURN_IF_ERROR(GetNodeDefTfType(params->node_def, &output_type,
4705                                       kCastOutputTypeAttrName));
4706 
4707   if (output_type != DataType::DT_FLOAT) {
4708     return unsupport_cast_error();
4709   }
4710 
4711   if (params->validation_only) return Status::OK();
4712 
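  // The cast itself is expressed with TensorRT's IIdentityLayer: setting the
  // layer precision to kFLOAT on an fp16 input makes the identity op emit an
  // fp32 copy. The check below guards against TensorRT silently ignoring the
  // requested precision.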
  ITensorProxyPtr input = params->inputs.at(0).tensor();
  nvinfer1::IIdentityLayer* layer =
      params->converter->network()->addIdentity(*input->trt_tensor());
  params->converter->SetLayerName(layer, node_def);
  layer->setPrecision(nvinfer1::DataType::kFLOAT);

  if (layer->getOutput(0)->getType() != nvinfer1::DataType::kFLOAT) {
    return errors::Internal("IIdentityLayer doesn't work as expected");
  }

  params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
  return Status::OK();
}

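// Example (illustrative): a ConcatV2 node with N=2 arrives with inputs
// {values_0, values_1, axis}; the first N inputs must be tensors and the
// trailing axis input must be a weight. E.g. concatenating two [B, 3, 8]
// tensors on axis=1 yields a [B, 6, 8] tensor.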
Status ConvertConcat(OpConverterParams* params) {
  const auto& inputs = params->inputs;
  const auto& node_def = params->node_def;
  TFAttrs attrs(node_def);
  // Get number of tensor inputs.
  const int num_inputs = attrs.get<int64>("N");
  if (num_inputs != static_cast<int>(inputs.size()) - 1) {
    return errors::InvalidArgument(
        "Number of inputs for ConcatV2 is inconsistent with N attribute, at ",
        node_def.name());
  }
  // Validate inputs. Values must be tensors for now, although it would be
  // possible to accept weights in explicit batch mode. See CheckInputsWeights
  // for details. TODO(tfeher): Allow weight input in explicit batch mode.
  std::vector<std::pair<string, TrtInputArg>> inputs_kinds;
  TrtInputArg expected_input = TrtInputArg::kTensor;
  inputs_kinds.reserve(num_inputs);
  for (int i = 0; i < num_inputs; ++i) {
    inputs_kinds.push_back({StrCat("values_", i), expected_input});
  }
  inputs_kinds.push_back({"axis", TrtInputArg::kWeight});
  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, inputs_kinds));

  std::set<DataType> allowed_types{DataType::DT_FLOAT, DataType::DT_HALF,
                                   DataType::DT_INT32};

  TF_RETURN_IF_ERROR(AllowDataTypes(*params, allowed_types));
  const auto axis = inputs.at(num_inputs).weights().GetSpan<int>();
  if (axis.size() != 1) {
    return errors::InvalidArgument("Axis for ConcatV2 must be a scalar, at ",
                                   node_def.name());
  }
  int trt_axis = 0;
  const auto dim = inputs.at(0).GetTrtDims();
  TF_RETURN_IF_ERROR(ConvertAxis(axis[0], dim.nbDims, node_def.name(),
                                 params->use_implicit_batch, &trt_axis));
  // Check that dimensions match on non-concatenate axis.
  TF_RETURN_IF_ERROR(VerifyShapesMatch(
      absl::Span<const TRT_TensorOrWeights>(inputs).first(num_inputs), trt_axis,
      node_def.name()));
  if (params->validation_only) return Status::OK();

  // Gather inputs as tensors.
  std::vector<ITensorProxyPtr> input_tensors;
  input_tensors.reserve(num_inputs);
  for (int i = 0; i < num_inputs; i++) {
    input_tensors.push_back(inputs.at(i).tensor());
  }
  std::vector<nvinfer1::ITensor*> trt_input_tensors;
  for (const auto& t : input_tensors) {
    trt_input_tensors.push_back(t->trt_tensor());
  }
  nvinfer1::IConcatenationLayer* layer =
      params->converter->network()->addConcatenation(
          static_cast<nvinfer1::ITensor* const*>(trt_input_tensors.data()),
          input_tensors.size());
  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
  params->converter->SetLayerName(layer, node_def);
  layer->setAxis(trt_axis);
  params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
  return Status::OK();
}

Status ConvertFusedBatchNorm(OpConverterParams* params) {
  const auto& inputs = params->inputs;
  const auto& node_def = params->node_def;
  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false},
                                                  {"scale", true},
                                                  {"offset", true},
                                                  {"mean", true},
                                                  {"variance", true}}));
  TF_RETURN_IF_ERROR(
      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
  TFAttrs attrs(node_def);
  float epsilon = attrs.get<float>("epsilon");
  auto data_format = attrs.get<string>("data_format");
  if (data_format != "NCHW") {
    return errors::Unimplemented(
        node_def.op(), " only supports data_format=NCHW, at ", node_def.name());
  }
  bool is_training = attrs.get<bool>("is_training");
  if (is_training) {
    // Trying to use batchnorm in training mode is a very common problem.
    // Because the error message will only be printed in VLOG(1) by the
    // segmenter, we issue a special warning so that users will actually see it.
    LOG_WARNING_WITH_PREFIX
        << node_def.op() << " only supports is_training=false. If you "
        << "are using Keras, please call "
        << "keras.backend.set_learning_phase(0) before constructing "
        << "your model. At " << node_def.name();
    return errors::Unimplemented(node_def.op(),
                                 " only supports is_training=false, at ",
                                 node_def.name());
  }
  ITensorProxyPtr tensor = inputs.at(0).tensor();
  if (!params->use_implicit_batch && tensor->getDimensions().d[1] == -1) {
    // This check is to make sure that the channel dimension is known during
    // conversion.
    //
    // We check this only in explicit batch mode and reject an op with unknown
    // channel dimension during segmentation. In implicit batch mode we have
    // known shapes during conversion even though the shapes may not be known
    // during segmentation (see the actual argument for input_shapes when
    // ConvertGraphDefToEngine is called from TRTEngineOp::BuildEngine).
    return errors::InvalidArgument("Channel dimension must be static, at ",
                                   node_def.name());
  }
  // Check parameter types.
  auto parameter_type = inputs.at(1).weights().TrtDType();
  if ((parameter_type != nvinfer1::DataType::kFLOAT) &&
      (parameter_type != nvinfer1::DataType::kHALF)) {
    return errors::Unimplemented(
        "Only float32 or float16 weight data type is supported, for node ",
        node_def.name(), " got ", DebugString(parameter_type));
  }
  for (int i = 1; i < 5; i++) {
    if (inputs.at(i).weights().TrtDType() != parameter_type) {
      return errors::Unimplemented(
          "Inconsistent parameter type for batchnorm is not supported, at: " +
          node_def.name());
    }
  }

  TRT_ShapedWeights dummy_power_weights(parameter_type);
  size_t nweight = 0;
  for (int i = 1; i < 5; i++) {
    nweight = std::max<size_t>(nweight, inputs.at(i).weights().count());
  }
  const TRT_ShapedWeights* ptr_shape_weights = nullptr;
  for (int i = 1; i < 5; i++) {
    if (inputs.at(i).weights().count() == nweight) {
      ptr_shape_weights = &(inputs.at(i).weights());
    } else if (inputs.at(i).weights().count() != 1) {
      return errors::InvalidArgument(
          "Inconsistent batchnorm parameter count, at: " + node_def.name());
    }
  }
  if (params->validation_only) return Status::OK();

  // We could technically have two weights with different shapes. That would
  // require two addScale ops, which is arguably less performant.
  TRT_ShapedWeights combined_scale_weights =
      params->weight_store->GetTempWeights(*ptr_shape_weights);
  TRT_ShapedWeights combined_offset_weights =
      params->weight_store->GetTempWeights(*ptr_shape_weights);

  const Eigen::half* cast_vals_array[4];
  const float* vals_array[4];
  for (int j = 0; j < 4; j++) {
    cast_vals_array[j] =
        static_cast<Eigen::half const*>(inputs.at(j + 1).weights().GetValues());
    vals_array[j] =
        static_cast<float const*>(inputs.at(j + 1).weights().GetValues());
  }
  Eigen::half* cast_combined_scale_vals =
      static_cast<Eigen::half*>(combined_scale_weights.GetValues());
  Eigen::half* cast_combined_offset_vals =
      static_cast<Eigen::half*>(combined_offset_weights.GetValues());
  float* combined_scale_vals =
      static_cast<float*>(combined_scale_weights.GetValues());
  float* combined_offset_vals =
      static_cast<float*>(combined_offset_weights.GetValues());

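  // Fold the four batchnorm parameters into a single scale/offset pair. For
  // inference, FusedBatchNorm computes
  //   y = scale * (x - mean) / sqrt(variance + epsilon) + offset,
  // which rearranges to y = combined_scale * x + combined_offset with
  //   combined_scale  = scale / sqrt(variance + epsilon)
  //   combined_offset = offset - mean * combined_scale,
  // so a single IScaleLayer can implement the whole op.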
  for (size_t i = 0; i < nweight; ++i) {
    float batchnorm_data[4];
    for (int j = 0; j < 4; j++) {
      if (inputs.at(j + 1).weights().count() != 1) {
        if (parameter_type == nvinfer1::DataType::kFLOAT) {
          batchnorm_data[j] = vals_array[j][i];
        } else if (parameter_type == nvinfer1::DataType::kHALF) {
          batchnorm_data[j] = static_cast<float>(cast_vals_array[j][i]);
        }
      } else {
        if (parameter_type == nvinfer1::DataType::kFLOAT) {
          batchnorm_data[j] = vals_array[j][0];
        } else if (parameter_type == nvinfer1::DataType::kHALF) {
          batchnorm_data[j] = static_cast<float>(cast_vals_array[j][0]);
        }
      }
    }
    float scale = batchnorm_data[0];
    float offset = batchnorm_data[1];
    float mean = batchnorm_data[2];
    float variance = batchnorm_data[3];
    float combined_scale_val = scale / sqrtf(variance + epsilon);
    float combined_offset_val = offset - mean * combined_scale_val;
    if (parameter_type == nvinfer1::DataType::kFLOAT) {
      combined_scale_vals[i] = combined_scale_val;
      combined_offset_vals[i] = combined_offset_val;
    } else if (parameter_type == nvinfer1::DataType::kHALF) {
      cast_combined_scale_vals[i] = Eigen::half(combined_scale_val);
      cast_combined_offset_vals[i] = Eigen::half(combined_offset_val);
    }
  }

  nvinfer1::ScaleMode mode = nweight == 1 ? nvinfer1::ScaleMode::kUNIFORM
                                          : nvinfer1::ScaleMode::kCHANNEL;
  nvinfer1::IScaleLayer* layer = params->converter->network()->addScale(
      *tensor->trt_tensor(), mode, combined_offset_weights.GetTrtWeights(),
      combined_scale_weights.GetTrtWeights(),
      dummy_power_weights.GetTrtWeights());
  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
  params->converter->SetLayerName(layer, node_def);
  ITensorProxyPtr output_tensor = layer->getOutput(0);
  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
  return Status::OK();
}

Status ConvertGather(OpConverterParams* params) {
  const auto& inputs = params->inputs;
  const auto& node_def = params->node_def;
  // TODO(tmorris): Use CheckInputsWeights by changing bool to enum with an
  // option for an input to be either tensor or weight.
  TF_RETURN_IF_ERROR(
      CheckInputsWeights(*params, {{"params", TrtInputArg::kBoth},
                                   {"indices", TrtInputArg::kTensor},
                                   {"axis", TrtInputArg::kWeight}}));

  const auto& params_input = inputs.at(0);
  const auto& indices_input = inputs.at(1);
  const auto& axis_input = inputs.at(2);

  TF_RETURN_IF_ERROR(AllowDataTypes(
      *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32},
      /*dtype_attr_name=*/"Tparams"));
  TF_RETURN_IF_ERROR(AllowDataTypes(*params, {DataType::DT_INT32},
                                    /*dtype_attr_name=*/"Tindices"));

  absl::Span<const int> axis = axis_input.weights().GetSpan<int>();
  if (axis.size() != 1) {
    return errors::InvalidArgument("Axis for GatherV2 must be a scalar, at ",
                                   node_def.name());
  }
  int trt_axis = 0;
  TF_RETURN_IF_ERROR(ConvertAxis(
      axis[0], params_input.GetTrtDims().nbDims, node_def.name(),
      params->use_implicit_batch && params_input.is_tensor(), &trt_axis));
  if (params->use_implicit_batch && params_input.is_weights() &&
      trt_axis != 0) {
    return errors::Unimplemented(
        "The input axis must be zero when params is a weight.");
  }
  if (params->use_implicit_batch && params_input.is_tensor() &&
      indices_input.batch_size() != 1) {
    return errors::Unimplemented(
        "Indices must have a batch size of 1 when params is a tensor.");
  }
  // If both inputs are tensors, the TF gather result will have rank
  // (params.nbDims + 1) + (indices.nbDims + 1) - 1,
  // where "+ 1" adds the batch dim. If params is a weight, the TRT rank matches
  // the TF rank so we don't have to add + 1.
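  // Worked example (implicit batch, both inputs tensors): params with TRT
  // dims [3, 4] (TF rank 3) gathered by indices with TRT dims [5] (TF rank 2)
  // gives a TF gather result of rank 3 + 2 - 1 = 4.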
  const int params_tf_rank =
      params_input.GetTrtDims().nbDims +
      (params->use_implicit_batch && params_input.is_tensor() ? 1 : 0);
  const int indices_tf_rank =
      indices_input.GetTrtDims().nbDims + (params->use_implicit_batch ? 1 : 0);
  const int tf_gather_output_rank = params_tf_rank + indices_tf_rank - 1;
  if (tf_gather_output_rank >
      nvinfer1::Dims::MAX_DIMS + (params->use_implicit_batch ? 1 : 0)) {
    return errors::InvalidArgument(
        "Result of gather has dimension greater than ",
        nvinfer1::Dims::MAX_DIMS + 1);
  }
  if (params->validation_only) return Status::OK();

  // Convert params to a tensor if it is a weight.
  ITensorProxyPtr params_tensor = nullptr;
  if (params_input.is_weights()) {
    params_tensor = params->converter->CreateConstantLayer(
        params_input.weights(), params_input.GetTrtDims());
  } else {
    params_tensor = params_input.tensor();
  }

  // Note on how IGatherLayer works: if both the data and indices tensors have
  // a batch size dimension of size N, it performs:
  // for batchid in xrange(N):
  //   output[batchid, a0, ..., an, i, ..., j, b0, ..., bn] = (
  //       data[batchid, a0, ..., an, indices[batchid, i, ..., j], b0, ..., bn])
  nvinfer1::IGatherLayer* layer = params->converter->network()->addGather(
      *params_tensor->trt_tensor(), *indices_input.tensor()->trt_tensor(),
      trt_axis);
  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
  params->converter->SetLayerName(layer, node_def);

  ITensorProxyPtr output_tensor = layer->getOutput(0);
  nvinfer1::Dims trt_gather_output_dims = output_tensor->getDimensions();
  // Note for the "- 2": one is for the output batch dim encapsulated by TF-TRT,
  // and the other is for the output dimension that is squeezed by IGatherLayer
  // because of the implicit batch dim in the indices (see the above note).
  const int expected_trt_output_rank =
      tf_gather_output_rank - (params_input.is_tensor() ? 2 : 1);
  if (params->use_implicit_batch &&
      trt_gather_output_dims.nbDims != expected_trt_output_rank) {
    return errors::Internal(
        "Get unexpected output dimensions of IGatherLayer. Expect nbDims: ",
        expected_trt_output_rank,
        ", actual nbDims: ", trt_gather_output_dims.nbDims);
  }
  // Reshape the output so after adding the implicit batch dim it'll match the
  // output shape of TF GatherV2.
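  // E.g. (illustrative) with trt_axis == 1 and an IGatherLayer output of
  // dims [a, b], the loop below shifts dims to [a, 1, b] so that prepending
  // the implicit batch dim reproduces the TF GatherV2 output shape.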
  if (params->use_implicit_batch && params_input.is_tensor()) {
    for (int i = trt_gather_output_dims.nbDims; i > trt_axis; --i) {
      trt_gather_output_dims.d[i] = trt_gather_output_dims.d[i - 1];
    }
    trt_gather_output_dims.d[trt_axis] = 1;
    ++trt_gather_output_dims.nbDims;

    TF_RETURN_IF_ERROR(PrepareTensorForShape(
        params->converter, TRT_TensorOrWeights(output_tensor),
        trt_gather_output_dims,
        /*validation_only=*/false, &output_tensor, node_def));
  }

  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
  return Status::OK();
}

// Converts the input matrix multiplication node to a fully connected (FC)
// layer if possible, as the FC layer has more tactics and INT8
// implementations. Returns the output ITensor* if the node is converted or
// nullptr if conversion is not possible. An error status indicates internal
// problems during conversion.
StatusOr<ITensorProxyPtr> ConvertFullyConnectedImpl(OpConverterParams* params,
                                                    TRT_TensorOrWeights input_a,
                                                    TRT_TensorOrWeights input_b,
                                                    bool transpose_a,
                                                    bool transpose_b) {
  if (!(!transpose_a && input_a.is_tensor() && input_b.is_weights())) {
    VLOG(2) << "Not FC compatible, A must be non transposed tensor, and B "
               "must be constant.";
    return ITensorProxyPtr(nullptr);
  }

  if (!params->use_implicit_batch && input_b.GetTrtDims().nbDims > 2 &&
      input_b.GetTrtDims().d[0] != 1) {
    // Implicit broadcasting, if needed, has already been considered to
    // transform the inputs and ensure the two operands have the same rank here.
    // If the inputs have rank >= 3, then d[0] is the explicit batch dimension.
    // The weight (input_b) must have batch size 1 in explicit batch mode.
    VLOG(2) << "Not FC compatible, if B has an explicit batch dimension, then "
               "it must be 1.";
    return ITensorProxyPtr(nullptr);
  }

  nvinfer1::Dims input_dim = input_a.GetTrtDims();
  if (input_dim.d[input_dim.nbDims - 1] == -1) {
    VLOG(2) << "Not FC compatible, last dim of A must be static.";
    return ITensorProxyPtr(nullptr);
  }

  if (input_dim.nbDims + 2 > nvinfer1::Dims::MAX_DIMS) {
    VLOG(2) << "Not FC compatible, cannot expand A's shape.";
    return ITensorProxyPtr(nullptr);
  }

  // Add two trailing 1's because FC layer combines the last three dims.
  ITensorProxyPtr tensor_a = nullptr;
  nvinfer1::Dims reshape_dim{input_dim.nbDims + 2, {}};
  // The empty braces initialize the elements of reshape_dim.d to 0. A value 0
  // in reshape_dim.d[i] will preserve the i-th dimension value from the shape
  // of input_a.
  reshape_dim.d[input_dim.nbDims] = 1;
  reshape_dim.d[input_dim.nbDims + 1] = 1;
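  // E.g. (illustrative shapes) an input A with dims [N, C] becomes
  // [N, C, 1, 1]; the FC layer then treats the last three dims (C, 1, 1) as
  // the input volume and produces [N, noutput, 1, 1], whose trailing 1's are
  // removed again below.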
  const NodeDef& node_def = params->node_def;
  TF_RETURN_IF_ERROR(PrepareTensorForShape(
      params->converter, input_a, reshape_dim,
      /*validation_only=*/false, &tensor_a, node_def, /*op_instance=*/0,
      /*origin_node_name=*/"FULLY_CONNECTED"));

  VLOG(2) << "New shape of A " << DebugString(tensor_a->getDimensions());

  TRT_ShapedWeights weights_b = input_b.weights();
  TRT_ShapedWeights weights_2D(weights_b);
  if (weights_b.shape_.nbDims > 2) {
    // Combine first nbDims-1 dims into a single dim, e.g. for a 4D tensor we
    // transform [N, H, W, C] -> [N*H*W, C]. This is only valid if all batch
    // dimensions are 1.
    if (std::any_of(weights_b.shape_.d,
                    weights_b.shape_.d + weights_b.shape_.nbDims - 2,
                    [](int d) { return d != 1; })) {
      VLOG(2) << "Not FC compatible, B has a batch dim larger than 1";
      return ITensorProxyPtr(nullptr);
    }
    int k = weights_b.shape_.d[weights_b.shape_.nbDims - 1];
    nvinfer1::Dims dims{2, {static_cast<int>(weights_b.count() / k), k}};
    TF_RETURN_IF_ERROR(weights_2D.SetShape(dims));
  }

  // FC layer will transpose weights, so we need to pre-transpose.
  TRT_ShapedWeights weights(weights_2D.TrtDType());
  if (!transpose_b) {
    weights = params->weight_store->GetTempWeights(weights_2D);
    ReorderCKtoKC(weights_2D, &weights);
  } else {
    weights = weights_2D;
  }
  TRT_ShapedWeights biases(weights.TrtDType());
  int k = weights.shape_.d[weights.shape_.nbDims - 1];
  const int noutput = weights.count() / k;
  VLOG(2) << "Using fully connected layer with k=" << k
          << ", n_output=" << noutput
          << " weights shape: " << DebugString(weights.shape_) << " to convert "
          << node_def.op();
  nvinfer1::IFullyConnectedLayer* layer =
      params->converter->network()->addFullyConnected(
          *tensor_a->trt_tensor(), noutput, weights.GetTrtWeights(),
          biases.GetTrtWeights());

  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
  params->converter->SetLayerName(layer, node_def);
  ITensorProxyPtr output_tensor = layer->getOutput(0);

  // A fully connected layer produces output with two trailing singleton
  // dimensions. We remove these.
  auto output_dim = output_tensor->getDimensions();
  output_dim.nbDims -= 2;
  // A zero in output_dim indicates copying the corresponding input dimension
  // value during reshape.
  std::fill(output_dim.d, output_dim.d + output_dim.nbDims, 0);
  TF_RETURN_IF_ERROR(PrepareTensorForShape(
      params->converter, TRT_TensorOrWeights(output_tensor), output_dim,
      /*validation_only=*/false, &output_tensor, node_def,
      /*op_instance=*/1, /*origin_node_name=*/"FULLY_CONNECTED"));
  return output_tensor;
}

StatusOr<ITensorProxyPtr> ConvertMatMulImpl(OpConverterParams* params,
                                            TRT_TensorOrWeights input_a,
                                            TRT_TensorOrWeights input_b,
                                            bool transpose_a,
                                            bool transpose_b) {
  if (params->use_implicit_batch) {
    // In implicit batch mode we are very limited in when we can multiply 2D
    // matrices. If input_A is a 2D tensor, then nbDims==1 (implicit batch dim
    // not counted). If A is not transposed and B is a weight, then we can
    // convert this treating A as a batch of vectors. This is the only
    // possibility to implement MatMul with 2D input in implicit batch mode.
    if ((input_a.GetTrtDims().nbDims < 2 &&
         (transpose_a || !input_b.is_weights())) ||
        (input_b.GetTrtDims().nbDims < 2)) {
      return errors::InvalidArgument(
          "MatMul with 2D tensors requires explicit batch mode, or that tensor"
          " A is not transposed and B is a constant tensor.");
    }
  }
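  // E.g. (illustrative) in implicit batch mode a TF MatMul over a [B, k]
  // tensor A and a [k, n] weight B reaches this point with A having a single
  // TRT dim; it is handled below via kVECTOR, treating A as a batch of
  // vectors.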

  if (params->validation_only) return ITensorProxyPtr(nullptr);

  StatusOr<ITensorProxyPtr> result = ConvertFullyConnectedImpl(
      params, input_a, input_b, transpose_a, transpose_b);
  TF_RETURN_IF_ERROR(result.status());
  ITensorProxyPtr output = result.ValueOrDie();
  if (*output) {
    // FC conversion was successful, we can return.
    return output;
  }
  const auto convert_to_itensor =
      [&params](TRT_TensorOrWeights operand) -> ITensorProxyPtr {
    if (operand.is_tensor()) {
      return operand.tensor();
    } else {
      return params->converter->CreateConstantLayer(operand.weights(),
                                                    operand.GetTrtDims());
    }
  };

  ITensorProxyPtr tensor_a = convert_to_itensor(input_a);
  ITensorProxyPtr tensor_b = convert_to_itensor(input_b);

  const auto get_matrix_op = [](ITensorProxyPtr in,
                                bool transpose) -> nvinfer1::MatrixOperation {
    return (transpose) ? nvinfer1::MatrixOperation::kTRANSPOSE
                       : nvinfer1::MatrixOperation::kNONE;
  };
  nvinfer1::MatrixOperation op_a, op_b;
  // Note: In implicit batch mode kTRANSPOSE and kNONE are only valid if the
  // matrix has at least 2 non-batch dimensions. In implicit batch mode, if A
  // has 1 dim (excluding the batch dim), then we can only use kVECTOR, which
  // will treat matrix A as a batch of vectors.
  op_a = (tensor_a->getDimensions().nbDims < 2)
             ? nvinfer1::MatrixOperation::kVECTOR
             : get_matrix_op(tensor_a, transpose_a);
  // In implicit batch mode, if B has only 1 dim (excluding the batch dim) then
  // we already reject the case and don't convert. One could consider using the
  // kVECTOR flag to express C = MatMul(A, B.T) if A is weight, but the result
  // will not have the correct shape: in TRT's implicit batch implementation,
  // the result is a batch of vectors D_ji = A_ik * B_jk, where j is the batch
  // dimension. In contrast, the TF MatMul op produces C = D.T, and we cannot
  // transpose over the batch dimension (implicit batch mode).
  op_b = get_matrix_op(tensor_b, transpose_b);

  nvinfer1::IMatrixMultiplyLayer* layer =
      params->converter->network()->addMatrixMultiply(
          *tensor_a->trt_tensor(), op_a, *tensor_b->trt_tensor(), op_b);

  const auto& node_def = params->node_def;
  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
  params->converter->SetLayerName(layer, node_def);
  return ITensorProxyPtr(layer->getOutput(0));
}

Status ConvertMatMulHelper(OpConverterParams* params,
                           TRT_TensorOrWeights input_a,
                           TRT_TensorOrWeights input_b, bool transpose_a,
                           bool transpose_b) {
  StatusOr<ITensorProxyPtr> result =
      ConvertMatMulImpl(params, input_a, input_b, transpose_a, transpose_b);
  TF_RETURN_IF_ERROR(result.status());
  if (!params->validation_only) {
    params->outputs->push_back(TRT_TensorOrWeights(result.ValueOrDie()));
  }
  return Status::OK();
}

// Inputs are both two dimensional (ops::MatMul).
Status ConvertMatMul(OpConverterParams* params) {
  const auto& inputs = params->inputs;
  const auto& node_def = params->node_def;
  if (inputs.size() != 2) {
    return errors::InvalidArgument(node_def.op(), " got ", inputs.size(),
                                   " inputs but expected 2, at ",
                                   node_def.name());
  }
  TF_RETURN_IF_ERROR(
      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));

  TFAttrs attrs(node_def);
  bool transpose_a = attrs.get<bool>("transpose_a");
  bool transpose_b = attrs.get<bool>("transpose_b");

  return ConvertMatMulHelper(params, inputs.at(0), inputs.at(1), transpose_a,
                             transpose_b);
}

Status ConvertBatchMatMul(OpConverterParams* params) {
  const auto& inputs = params->inputs;
  const auto& node_def = params->node_def;
  if (inputs.size() != 2) {
    return errors::InvalidArgument(node_def.op(), " got ", inputs.size(),
                                   " inputs but expected 2, at ",
                                   node_def.name());
  }
  TF_RETURN_IF_ERROR(CheckInputsWeights(
      *params, {{"x", TrtInputArg::kBoth}, {"y", TrtInputArg::kBoth}}));
  // TODO(tfeher): Consider adding INT8 type because FC layer can support it.
  TF_RETURN_IF_ERROR(
      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
  if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) {
    return errors::InvalidArgument(
        "All inputs are weights, but Grappler is expected to fold them.");
  }

  TFAttrs attrs(node_def);
  const bool transpose_a = attrs.get<bool>("adj_x");
  const bool transpose_b = attrs.get<bool>("adj_y");

  // In case input_l is a weight, check whether input_l has an implicit batch
  // mode compatible batch dim.
  const auto check_weight_is_not_batched =
      [](const TRT_TensorOrWeights& input_l,
         const TRT_TensorOrWeights& input_r) {
        // There is no way to batch constants in TRT using implicit batch mode.
        // Example:
        // Tensor with TF Dims: 12 5 3 -> TRT Dims: 5 3
        // Weight with TF Dims: 12 3 6 -> TRT Dims: 12 3 6
        // It is not possible to treat the weight input as a batched [3, 6]
        // tensor. Batched weight tensors must have batch dim = 1 (after the
        // broadcast).
        if (input_l.is_weights() &&
            input_l.GetTrtDims().nbDims > input_r.GetTrtDims().nbDims &&
            input_l.GetTrtDims().d[0] != 1) {
          return errors::Unimplemented(
              "TensorRT does not support batched constants in implicit batch "
              "mode.");
        }
        return Status::OK();
      };
  if (params->use_implicit_batch) {
    TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(0), inputs.at(1)));
    TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(1), inputs.at(0)));
  }

  // Broadcast inputs. We don't check feasibility since the dimensions in a
  // MatMul don't need to match. For example, consider a valid set of inputs
  // which would produce an output of shape [N, T, K]:
  // input 0: [N, T, C]
  // input 1: [1, C, K]
  // Since C != K and T != C, the feasibility check would fail.
  auto input_l = std::make_unique<TRT_TensorOrWeights>(inputs.at(0));
  auto input_r = std::make_unique<TRT_TensorOrWeights>(inputs.at(1));
  TF_RETURN_IF_ERROR(BroadcastTensors(input_l, input_r,
                                      /*check_feasibility=*/false, params));

  if (params->validation_only) return Status::OK();

  return ConvertMatMulHelper(params, *input_l, *input_r, transpose_a,
                             transpose_b);
}

// Finds the indices of elements in [begin, end) in array
// [array_begin, array_end), and appends the indices to permute. This is used
// to construct the permutation sequence for the operand with input labels
// [array_begin, array_end) to the desired permuted labels [begin, end).
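// Example (assumed labels): with [begin, end) = "ca" and
// [array_begin, array_end) = "abc", permute receives {2, 0}.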
template <typename Iterator>
Status FindIndices(Iterator begin, Iterator end, Iterator array_begin,
                   Iterator array_end, std::vector<int>* permute) {
  const int n = array_end - array_begin;
  if (n < end - begin) {
    return errors::Internal("Incorrect array size");
  }
  for (auto i = begin; i < end; i++) {
    int idx = std::find(array_begin, array_end, *i) - array_begin;
    if (idx >= n) {
      return errors::Internal("Label not found");
    }
    permute->push_back(idx);
  }
  return Status::OK();
}

#if IS_TRT_VERSION_GE(7, 1, 3, 0)
// Layout of the einsum dimensions: Batch, Free and Contraction indices.
// Example: abcd,adef -> abde. The first tensor has layout BFC, the second BCF.
enum class EinsumLayout { BFC, BCF, MIX };

// Describes an operand: input shape, number of batch, free and contract
// dimensions, and the permutation that is needed to bring it to a matmul
// compatible form.
struct EinsumDescriptor {
  EinsumDescriptor() : b(0), f(0), c(0) {}

  // Deduces the number of batch, free, contract dimensions from the input
  // labels, decides what layout to use, and determines permutation indices for
  // that layout.
  Status InitDescriptor(const TRT_TensorOrWeights& operand, Labels input_labels,
                        std::vector<EinsumHelper::DimensionType>& label_types,
                        EinsumLayout preferred_layout,
                        EinsumDescriptor* other = nullptr) {
    if (preferred_layout == EinsumLayout::MIX)
      return errors::Internal("Preferred einsum layout cannot be MIX");
    const EinsumHelper::DimensionType kBatch =
        EinsumHelper::DimensionType::kBatch;
    const EinsumHelper::DimensionType kFree =
        EinsumHelper::DimensionType::kFree;
    const EinsumHelper::DimensionType kContract =
        EinsumHelper::DimensionType::kContract;

    // Map label indices to label types.
    std::vector<EinsumHelper::DimensionType> types;  // Input label types.
    std::transform(input_labels.begin(), input_labels.end(),
                   std::back_inserter(types),
                   [&label_types, kBatch](int i) { return label_types.at(i); });

    using label_t_iterator = std::vector<EinsumHelper::DimensionType>::iterator;
    auto count_labels = [](label_t_iterator begin, label_t_iterator end,
                           EinsumHelper::DimensionType val) {
      return std::count_if(begin, end, [val](EinsumHelper::DimensionType t) {
        return t == val;
      });
    };

    b = count_labels(types.begin(), types.end(), kBatch);
    f = count_labels(types.begin(), types.end(), kFree);
    c = count_labels(types.begin(), types.end(), kContract);

    if (c == 0 || f == 0) {
      VLOG(2) << "Einsum equation needs to have at least one free and one "
                 "contract dimension";
      return errors::Unimplemented("No conversion for einsum equation.");
    }

    // Checks whether input_labels[offset:offset+m] matches labels from other.
    auto order_matches = [other, &input_labels, kBatch, kFree, kContract](
                             int offset, int m,
                             EinsumHelper::DimensionType dim_type) {
      if (!other) return true;
      int offset_other = 0;
      if (dim_type == kFree)
        offset = other->offset_f;
      else if (dim_type == kContract)
        offset = other->offset_c;
      return std::equal(input_labels.begin() + offset,
                        input_labels.begin() + offset + m,
                        other->permuted_labels.begin() + offset_other);
    };

    // Check if the current layout is BFC or BCF. In that case we could avoid
    // the transpose.
    layout = EinsumLayout::MIX;
    if (count_labels(types.begin(), types.begin() + b, kBatch) == b &&
        order_matches(0, b, kBatch)) {
      // Batch dims are the leading dims. They have the same order as other.
      if (count_labels(types.begin() + b, types.begin() + b + f, kFree) == f) {
        // All the free dims are placed consecutively after the batch dims.
        // Their order is arbitrary. The final transpose will ensure that the
        // output has correct order. We still have to check that the contract
        // indices have correct order.
        if (order_matches(b + f, c, kContract)) {
          layout = EinsumLayout::BFC;
        }
      } else if (count_labels(types.begin() + b, types.begin() + b + c,
                              kContract) == c) {
        // All the contract dims are placed consecutively after the batch
        // dims. Check whether the contract dims have the same order as the
        // contract dims in other.
        if (order_matches(b, c, kContract)) {
          layout = EinsumLayout::BCF;
        }
      }
    }

    if (layout == EinsumLayout::MIX) {
      // Input label types are mixed. Calculate a permutation that maps them
      // to the preferred layout (BCF or BFC).
      layout = preferred_layout;
      if (!other) {
        AppendMatchingIndicesToPermute(types, kBatch);
      } else {
        TF_RETURN_IF_ERROR(
            FindIndices(other->permuted_labels.begin(),
                        other->permuted_labels.begin() + other->b,
                        input_labels.begin(), input_labels.end(), &permute));
      }
      if (layout == EinsumLayout::BFC) {
        AppendMatchingIndicesToPermute(types, kFree);
        if (!other) {
          AppendMatchingIndicesToPermute(types, kContract);
        } else {
          TF_RETURN_IF_ERROR(FindIndices(
              other->permuted_labels.begin() + other->offset_c,
              other->permuted_labels.begin() + other->offset_c + other->c,
              input_labels.begin(), input_labels.end(), &permute));
        }
      } else {
        if (!other) {
          AppendMatchingIndicesToPermute(types, kContract);
        } else {
          TF_RETURN_IF_ERROR(FindIndices(
              other->permuted_labels.begin() + other->offset_c,
              other->permuted_labels.begin() + other->offset_c + other->c,
              input_labels.begin(), input_labels.end(), &permute));
        }
        AppendMatchingIndicesToPermute(types, kFree);
      }
    }

    if (layout == EinsumLayout::BFC) {
      offset_f = b;
      offset_c = f + b;
    } else {
      offset_f = b + c;
      offset_c = b;
    }

    dims = operand.GetTrtDims();
    for (int i = 0; i < b; i++) {
      // Set unknown batch dims to zero. These dims will be used in reshape op,
      // where zero is a special value for retaining the original dim size.
      if (dims.d[i] == -1) dims.d[i] = 0;
    }
    permuted_labels = input_labels;
    if (!permute.empty()) {
      // Apply the permutation on the dimension array.
      nvinfer1::Dims orig_dims = dims;
      for (int i = 0; i < permute.size(); i++) {
        dims.d[i] = orig_dims.d[permute[i]];
        permuted_labels[i] = input_labels[permute[i]];
      }
    }
    size_tensors.resize(dims.nbDims, nullptr);

    VLOG(2) << "Set up descriptor with "
            << (layout == EinsumLayout::BFC ? "BFC" : "BCF")
            << " layout, b=" << b << ", f=" << f << ", c=" << c;
    return Status::OK();
  }

  // Appends to permute the indices at which types equals val.
  void AppendMatchingIndicesToPermute(
      const std::vector<EinsumHelper::DimensionType>& types,
      EinsumHelper::DimensionType val) {
    for (int i = 0; i < types.size(); i++) {
      if (types[i] == val) {
        permute.push_back(i);
      }
    }
  }

  // Returns whether the free and contract dimensions have static shapes.
  bool HasStaticShape() {
    return !std::any_of(dims.d + b, dims.d + dims.nbDims,
                        [](int k) { return k == -1; });
  }

  nvinfer1::Permutation GetPermutation() {
    nvinfer1::Permutation p;
    std::copy(permute.begin(), permute.end(), p.order);
    return p;
  }

  Status SetDynamicSize(OpConverterParams* params,
                        const TRT_TensorOrWeights& operand) {
    if (operand.GetTrtDims().nbDims != dims.nbDims)
      return errors::Internal("Operand dims must agree with descriptor dims");

    if (operand.is_weights()) {
      for (int i = 0; i < operand.GetTrtDims().nbDims; i++) {
        // dims.d stores the permuted dims.
        TF_RETURN_IF_ERROR(
            CreateScalarConstant(params, dims.d[i], &size_tensors[i]));
      }
      return Status::OK();
    }
    auto* shape_layer =
        params->converter->network()->addShape(*operand.tensor()->trt_tensor());
    TFTRT_RETURN_ERROR_IF_NULLPTR(shape_layer, params->node_def.name());
    ITensorProxyPtr shape = shape_layer->getOutput(0);
    for (int i = 0; i < operand.GetTrtDims().nbDims; i++) {
      int idx = permute.empty() ? i : permute.at(i);
      auto* layer = params->converter->network()->addSlice(
          *shape->trt_tensor(), {1, {idx}}, {1, {1}}, {1, {1}});
      TFTRT_RETURN_ERROR_IF_NULLPTR(layer, params->node_def.name());
      size_tensors[i] = layer->getOutput(0);
      TFTRT_RETURN_ERROR_IF_NULLPTR(size_tensors[i], "error, slice is nullptr");
    }
    return Status::OK();
  }

  EinsumLayout layout;
  int b;  // number of batch dims
  int f;  // number of free dims
  int c;  // number of contraction dims
  int offset_f;
  int offset_c;
  nvinfer1::Dims dims;
  std::vector<int> permute;
  std::vector<ITensorProxyPtr> size_tensors;
  Labels permuted_labels;
};

Status GetDimsProd(nvinfer1::Dims dims, int offset, int n, int32_t* out) {
  size_t prod = std::accumulate(dims.d + offset, dims.d + offset + n, size_t(1),
                                std::multiplies<size_t>());
  if (prod > std::numeric_limits<int32_t>::max()) {
    return errors::Internal("Matrix too large");
  } else {
    *out = prod;
  }
  return Status::OK();
}

Status GetDimsProdDynamic(OpConverterParams* params,
                          std::vector<ITensorProxyPtr>::const_iterator begin,
                          std::vector<ITensorProxyPtr>::const_iterator end,
                          ITensorProxyPtr* out) {
  *out = *begin;
  begin++;
  while (begin != end) {
    nvinfer1::IElementWiseLayer* layer =
        params->converter->network()->addElementWise(
            *(*out)->trt_tensor(), *(*begin)->trt_tensor(),
            nvinfer1::ElementWiseOperation::kPROD);
    TFTRT_RETURN_ERROR_IF_NULLPTR(layer, params->node_def.name());
    *out = layer->getOutput(0);
    begin++;
  }
  return Status::OK();
}

Status ConcatenateShape(OpConverterParams* params,
                        const std::vector<ITensorProxyPtr> size_tensors,
                        ITensorProxyPtr* new_shape) {
  std::vector<nvinfer1::ITensor*> trt_size_tensors;
  for (const auto& t : size_tensors) {
    trt_size_tensors.push_back(t->trt_tensor());
  }
  nvinfer1::IConcatenationLayer* layer =
      params->converter->network()->addConcatenation(
          static_cast<nvinfer1::ITensor* const*>(trt_size_tensors.data()),
          size_tensors.size());
  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, params->node_def.name());
  layer->setAxis(0);
  *new_shape = layer->getOutput(0);
  return Status::OK();
}

// Reshapes operand so that the free dimensions are combined into a single dim,
// and the contract dimensions are combined into another single dim.
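// E.g. (illustrative) a BFC operand with dims [B, f1, f2, c1, c2] is
// reshaped to [B, f1*f2, c1*c2], which is the form IMatrixMultiplyLayer
// expects.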
GetEinsumNewDynamicShape(OpConverterParams * params,const EinsumDescriptor & desc,ITensorProxyPtr * new_shape)5616 Status GetEinsumNewDynamicShape(OpConverterParams* params,
5617                                 const EinsumDescriptor& desc,
5618                                 ITensorProxyPtr* new_shape) {
5619   std::vector<ITensorProxyPtr> size(desc.size_tensors.begin(),
5620                                     desc.size_tensors.begin() + desc.b + 2);
5621 
5622   int idx_f = desc.layout == EinsumLayout::BFC ? desc.b : desc.b + 1;
5623   int idx_c = desc.layout == EinsumLayout::BFC ? desc.b + 1 : desc.b;
5624 
5625   TF_RETURN_IF_ERROR(GetDimsProdDynamic(
5626       params, desc.size_tensors.begin() + desc.offset_f,
5627       desc.size_tensors.begin() + desc.offset_f + desc.f, &size[idx_f]));
5628 
5629   TF_RETURN_IF_ERROR(GetDimsProdDynamic(
5630       params, desc.size_tensors.begin() + desc.offset_c,
5631       desc.size_tensors.begin() + desc.offset_c + desc.c, &size[idx_c]));
5632 
5633   TF_RETURN_IF_ERROR(ConcatenateShape(params, size, new_shape));
5634   return Status::OK();
5635 }
5636 
5637 // Reshapes operand so that the free dimensions are combined into a single dim,
5638 // and the contract dimensions are combined into another single dim.
GetEinsumNewStaticShape(const EinsumDescriptor & desc,nvinfer1::Dims * new_dims)5639 Status GetEinsumNewStaticShape(const EinsumDescriptor& desc,
5640                                nvinfer1::Dims* new_dims) {
5641   new_dims->nbDims = desc.b + 2;
5642   // Copy batch dims.
5643   std::copy(desc.dims.d, desc.dims.d + desc.b, new_dims->d);
5644   // Combine free dims and contract dims.
5645   int idx_f = desc.layout == EinsumLayout::BFC ? desc.b : desc.b + 1;
5646   int idx_c = desc.layout == EinsumLayout::BFC ? desc.b + 1 : desc.b;
5647   TF_RETURN_IF_ERROR(
5648       GetDimsProd(desc.dims, desc.offset_f, desc.f, new_dims->d + idx_f));
5649   TF_RETURN_IF_ERROR(
5650       GetDimsProd(desc.dims, desc.offset_c, desc.c, new_dims->d + idx_c));
5651   return Status::OK();
5652 }
5653 
5654 // Adds shuffle layer (if needed) to bring einsum operand to a matmul compatible
5655 // format.
ShuffleEinsumTensor(OpConverterParams * params,std::unique_ptr<TRT_TensorOrWeights> * operand,EinsumDescriptor * desc,int op_instance)5656 Status ShuffleEinsumTensor(OpConverterParams* params,
5657                            std::unique_ptr<TRT_TensorOrWeights>* operand,
5658                            EinsumDescriptor* desc, int op_instance) {
5659   if (params->validation_only) return Status::OK();
5660   TF_RETURN_IF_ERROR(desc->SetDynamicSize(params, **operand));
5661   bool need_reshape = (desc->f != 1 || desc->c != 1);
5662   bool need_transpose = !desc->permute.empty();
5663   if ((*operand)->is_weights()) {
5664     nvinfer1::Dims new_dims;
5665     TF_RETURN_IF_ERROR(GetEinsumNewStaticShape(*desc, &new_dims));
5666     if (!need_transpose) {
5667       TRT_ShapedWeights weights((*operand)->weights());
5668       TF_RETURN_IF_ERROR(weights.SetShape(new_dims));
5669       operand->reset(new TRT_TensorOrWeights(weights));
5670       return Status::OK();
5671     }
5672     // TODO(tfeher): Instead of creating a tensor that will be transposed,
5673     // transpose the weight itself. Keeping it weight could enable FC layer.
5674     ITensorProxyPtr tensor = params->converter->CreateConstantLayer(
5675         (*operand)->weights(), (*operand)->GetTrtDims());
5676     operand->reset(new TRT_TensorOrWeights(tensor));
5677   }
5678 
5679   if (!need_transpose && !need_reshape) return Status::OK();
5680   ITensorProxyPtr operand_tensor = (*operand)->tensor();
5681   TFTRT_RETURN_ERROR_IF_NULLPTR(operand_tensor, "Null tensor at Einsum");
5682   nvinfer1::IShuffleLayer* layer =
5683       params->converter->network()->addShuffle(*operand_tensor->trt_tensor());
5684 
5685   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, params->node_def.name());
5686   params->converter->SetLayerName(layer, params->node_def, "shuffle",
5687                                   /*op_instance=*/op_instance);
5688   // Set new shape.
5689   if (need_reshape) {
5690     if (desc->HasStaticShape()) {
5691       nvinfer1::Dims new_dims;
5692       TF_RETURN_IF_ERROR(GetEinsumNewStaticShape(*desc, &new_dims));
5693       layer->setReshapeDimensions(new_dims);
5694     } else {
5695       ITensorProxyPtr new_shape;
5696       TF_RETURN_IF_ERROR(GetEinsumNewDynamicShape(params, *desc, &new_shape));
5697       layer->setInput(1, *new_shape->trt_tensor());
5698     }
5699   }
5700 
5701   if (need_transpose) {
5702     layer->setFirstTranspose(desc->GetPermutation());
5703   }
5704   operand->reset(new TRT_TensorOrWeights(layer->getOutput(0)));
5705   return Status::OK();
5706 }
5707 
5708 // Combines output dims/labels by copying batch and free dims/labels from input
5709 // A, and concatenating free values from input B.
5710 template <typename InputIterator, typename OutputIterator>
AssembleOutput(InputIterator begin_a,InputIterator begin_b,const EinsumDescriptor & desc_a,const EinsumDescriptor & desc_b,OutputIterator out)5711 void AssembleOutput(InputIterator begin_a, InputIterator begin_b,
5712                     const EinsumDescriptor& desc_a,
5713                     const EinsumDescriptor& desc_b, OutputIterator out) {
5714   std::copy(begin_a, begin_a + desc_a.b, out);
5715   begin_a += desc_a.offset_f;
5716   std::copy(begin_a, begin_a + desc_a.f, out + desc_a.b);
5717   begin_b += desc_b.offset_f;
5718   std::copy(begin_b, begin_b + desc_b.f, out + desc_a.b + desc_a.f);
5719 }
5720 
5721 // Restores free dimensions and sets final index order. Consider C = A * B,
5722 // batched MatMul op, where A.shape = [B, x, k] and B.shape = [B, k, y]. Then
5723 // C.shape = [B, x, y]. Here B can denote multiple batch indices while x, y, k
5724 // are single indices. The original inputs to Einsum can have multiple free
5725 // indices. These were combined into a singe free dimension x and y, for example
5726 // x = f_a1 * f_a2 * f_a3, y = f_b1 * f_b2. This routine creates a shuffle layer
5727 // to expand x into and y the original free dims, e.g. C is reshaped to
5728 // [B, f_a1, f_a2, f_a3, f_b1, f_b2]. Finally, a permutation is applied to
5729 // transform the shape to the shape of the original Einsum output.
ShuffleEinsumOutput(OpConverterParams * params,EinsumDescriptor desc_a,EinsumDescriptor desc_b,const std::vector<int> & permutation,ITensorProxyPtr * output)5730 Status ShuffleEinsumOutput(OpConverterParams* params, EinsumDescriptor desc_a,
5731                            EinsumDescriptor desc_b,
5732                            const std::vector<int>& permutation,
5733                            ITensorProxyPtr* output) {
5734   if (permutation.empty() && (desc_a.f == 1 && desc_b.f == 1))
5735     return Status::OK();
5736 
5737   nvinfer1::IShuffleLayer* layer =
5738       params->converter->network()->addShuffle(*(*output)->trt_tensor());
5739   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, params->node_def.name());
5740   params->converter->SetLayerName(layer, params->node_def, "shuffle",
5741                                   /*op_instance=*/2);
5742 
5743   int output_rank = desc_a.b + desc_a.f + desc_b.f;
5744   if (desc_a.f != 1 || desc_b.f != 1) {
5745     if (desc_a.HasStaticShape() && desc_b.HasStaticShape()) {
5746       nvinfer1::Dims dims_out = {output_rank, {}};
5747       AssembleOutput(desc_a.dims.d, desc_b.dims.d, desc_a, desc_b, dims_out.d);
5748       layer->setReshapeDimensions(dims_out);
5749     } else {
5750       std::vector<ITensorProxyPtr> size_tensors(output_rank);
5751       AssembleOutput(desc_a.size_tensors.begin(), desc_b.size_tensors.begin(),
5752                      desc_a, desc_b, size_tensors.begin());
5753       ITensorProxyPtr new_shape;
5754       TF_RETURN_IF_ERROR(ConcatenateShape(params, size_tensors, &new_shape));
5755       layer->setInput(1, *new_shape->trt_tensor());
5756     }
5757   }
5758 
5759   if (!permutation.empty()) {
5760     nvinfer1::Permutation p;
5761     std::copy(permutation.begin(), permutation.end(), p.order);
5762     layer->setSecondTranspose(p);
5763   }
5764   *output = layer->getOutput(0);
5765   return Status::OK();
5766 }
5767 
5768 // Prepares EinsumDescriptors after parsing the equation and determines the
5769 // final transpose.
ParseEquation(OpConverterParams * params,std::unique_ptr<TRT_TensorOrWeights> * input_a,std::unique_ptr<TRT_TensorOrWeights> * input_b,EinsumDescriptor * descriptor_a,EinsumDescriptor * descriptor_b,std::vector<int> * final_transpose)5770 Status ParseEquation(OpConverterParams* params,
5771                      std::unique_ptr<TRT_TensorOrWeights>* input_a,
5772                      std::unique_ptr<TRT_TensorOrWeights>* input_b,
5773                      EinsumDescriptor* descriptor_a,
5774                      EinsumDescriptor* descriptor_b,
5775                      std::vector<int>* final_transpose) {
5776   TFAttrs attrs(params->node_def);
5777   std::string equation = attrs.get<string>("equation");
5778   VLOG(2) << "Einsum equation " << equation;
5779 
5780   OperandLabels input_labels;
5781   Labels output_labels;
5782   std::vector<EinsumHelper::DimensionType> label_types;
5783   OperandLabelCounts input_label_counts;
5784   LabelCounts output_label_counts;
5785   absl::InlinedVector<bool, 2> input_has_ellipsis;
5786   bool output_has_ellipsis;
5787   TF_RETURN_IF_ERROR(EinsumHelper::ParseEquation(
5788       equation, &input_labels, &output_labels, &label_types,
5789       &input_label_counts, &output_label_counts, &input_has_ellipsis,
5790       &output_has_ellipsis));
5791 
5792   VLOG(2) << "Output has ellipsis: " << output_has_ellipsis;
5793 
5794   if (input_has_ellipsis[0] || input_has_ellipsis[1] || output_has_ellipsis) {
5795     // TODO(tfeher): Handle ellipsis like EinsumHelper::ProcessDimensions.
5796     // Note: ProcessDimensions would introduce kBroadcasting labels, which we
5797     // need to replace with kBatch before we call InitDescriptor.
5798     VLOG(2) << "Ellipsis not yet supported";
5799     return errors::Unimplemented("No conversion for einsum equation.");
5800   }
5801   if (absl::c_any_of(label_types, [](auto l) {
5802         return l == EinsumHelper::DimensionType::kReduce ||
5803                l == EinsumHelper::DimensionType::kBroadcasting;
5804       })) {
5805     VLOG(2) << "Einsum reductions not implemented";
5806     return errors::Unimplemented("No conversion for einsum equation.");
5807   }
5808 
5809   auto no_duplicated_labels = [](const LabelCounts& label_counts) {
5810     return absl::c_any_of(label_counts, [](int i) { return i > 1; });
5811   };
5812   if (no_duplicated_labels(input_label_counts[0]) ||
5813       no_duplicated_labels(input_label_counts[1]) ||
5814       no_duplicated_labels(output_label_counts)) {
5815     VLOG(2) << "Einsum invalid label count";
5816     return errors::Unimplemented("No conversion for einsum equation.");
5817   }
5818 
5819   if ((*input_a)->is_weights() && (*input_b)->is_tensor()) {
5820     // We prefer the FC layer, which needs A as a tensor and B as weights.
5821     std::swap(*input_a, *input_b);
5822     std::swap(input_labels[0], input_labels[1]);
5823     std::swap(input_label_counts[0], input_label_counts[1]);
5824   }
5825 
5826   TF_RETURN_IF_ERROR(descriptor_a->InitDescriptor(
5827       **input_a, input_labels[0], label_types, EinsumLayout::BFC));
5828   TF_RETURN_IF_ERROR(
5829       descriptor_b->InitDescriptor(**input_b, input_labels[1], label_types,
5830                                    EinsumLayout::BCF, descriptor_a));
5831   // TODO(tfeher): Update the permutation in the descriptors to avoid final
5832   // transpose (if possible). Consider swapping the input if it eliminates
5833   // final transpose.
5834 
5835   // Get final transpose.
5836   Labels matmul_output_labels(descriptor_a->b + descriptor_a->f +
5837                               descriptor_b->f);
5838   AssembleOutput(descriptor_a->permuted_labels.begin(),
5839                  descriptor_b->permuted_labels.begin(), *descriptor_a,
5840                  *descriptor_b, matmul_output_labels.begin());
5841   TF_RETURN_IF_ERROR(FindIndices(output_labels.begin(), output_labels.end(),
5842                                  matmul_output_labels.begin(),
5843                                  matmul_output_labels.end(), final_transpose));
5844   // Clear identity transpose.
5845   bool identity_transpose = true;
5846   for (int i = 0; i < final_transpose->size() && identity_transpose; i++) {
5847     identity_transpose &= final_transpose->at(i) == i;
5848   }
5849   if (identity_transpose) {
5850     final_transpose->clear();
5851   }
5852   return Status::OK();
5853 }
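// Illustrative sketch (not part of the converter): the identity-transpose
// check above is equivalent to the following standalone helper. For the
// equation "ab,cb->ac" the matmul output labels are already (a, c), so
// final_transpose comes back as {0, 1} and is cleared.
static bool IsIdentityPermutation(const std::vector<int>& perm) {
  for (int i = 0; i < static_cast<int>(perm.size()); ++i) {
    if (perm[i] != i) return false;
  }
  return true;
}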
5854 
5855 Status ConvertEinsum(OpConverterParams* params) {
5856   const auto& inputs = params->inputs;
5857   const auto& node_def = params->node_def;
5858   if (params->use_implicit_batch) {
5859     return errors::Unimplemented(
5860         "Einsum converter requires dynamic shape mode");
5861   }
5862 
5863   if (inputs.size() != 2) {
5864     VLOG(2) << "Einsum converter supports two operands at " << node_def.name()
5865             << " got " << inputs.size();
5866     return errors::Unimplemented("No conversion for einsum equation.");
5867   }
5868   TF_RETURN_IF_ERROR(
5869       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
5870 
5871   auto input_a = std::make_unique<TRT_TensorOrWeights>(inputs.at(0));
5872   auto input_b = std::make_unique<TRT_TensorOrWeights>(inputs.at(1));
5873   EinsumDescriptor descriptor_a;
5874   EinsumDescriptor descriptor_b;
5875   std::vector<int> final_transpose;
5876   TF_RETURN_IF_ERROR(ParseEquation(params, &input_a, &input_b, &descriptor_a,
5877                                    &descriptor_b, &final_transpose));
5878 
5879   TF_RETURN_IF_ERROR(ShuffleEinsumTensor(params, &input_a, &descriptor_a,
5880                                          /*op_instance=*/0));
5881   TF_RETURN_IF_ERROR(ShuffleEinsumTensor(params, &input_b, &descriptor_b,
5882                                          /*op_instance=*/1));
5883   if (params->validation_only) return Status::OK();
5884 
5885   StatusOr<ITensorProxyPtr> result = ConvertMatMulImpl(
5886       params, *input_a, *input_b, descriptor_a.layout == EinsumLayout::BCF,
5887       descriptor_b.layout == EinsumLayout::BFC);
5888   TF_RETURN_IF_ERROR(result.status());
5889   ITensorProxyPtr output = result.ValueOrDie();
5890 
5891   TF_RETURN_IF_ERROR(ShuffleEinsumOutput(params, descriptor_a, descriptor_b,
5892                                          final_transpose, &output));
5893   params->outputs->push_back(TRT_TensorOrWeights(output));
5894   return Status::OK();
5895 }
5896 #endif  // IS_TRT_VERSION_GE(7, 1, 3, 0)
5897 
5898 Status ConvertSoftmax(OpConverterParams* params) {
5899   const auto& inputs = params->inputs;
5900   const auto& node_def = params->node_def;
5901   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"logits", false}}));
5902   TF_RETURN_IF_ERROR(
5903       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
5904   ITensorProxyPtr tensor = inputs.at(0).tensor();
5905 
5906   const int num_trt_dims = tensor->getDimensions().nbDims;
5907   if (num_trt_dims == 0 && params->use_implicit_batch) {
5908     return errors::InvalidArgument(
5909         "TensorRT Softmax cannot be applied to the batch dimension, at ",
5910         node_def.name());
5911   }
5912   if (params->validation_only) return Status::OK();
5913 
5914   nvinfer1::ISoftMaxLayer* layer =
5915       params->converter->network()->addSoftMax(*tensor->trt_tensor());
5916   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
5917   params->converter->SetLayerName(layer, node_def);
5918   // TensorFlow Softmax applies softmax along the last dimension.
5919   layer->setAxes(1 << (num_trt_dims - 1));
5920 
5921   ITensorProxyPtr output_tensor = layer->getOutput(0);
5922   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
5923   return Status::OK();
5924 }
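// Illustrative sketch (not part of the converter): TensorRT layers that take
// a reduction-axes argument expect a bitmask with one bit per dimension, so
// softmax over the last dimension of a rank-4 tensor uses 1 << 3 == 0b1000.
static uint32_t LastAxisMask(int rank) {
  return 1u << (rank - 1);  // e.g. LastAxisMask(4) == 8
}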
5925 
5926 Status ConvertArgMinMax(OpConverterParams* params) {
5927   const auto& inputs = params->inputs;
5928   const auto& node_def = params->node_def;
5929   TF_RETURN_IF_ERROR(
5930       CheckInputsWeights(*params, {{"input", false}, {"dimension", true}}));
5931   TF_RETURN_IF_ERROR(
5932       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
5933   // INT64 outputs are not supported by TRT.
5934   TFAttrs attrs(node_def);
5935   DataType output_dtype = attrs.get<DataType>("output_type");
5936   if (output_dtype != DataType::DT_INT32) {
5937     return errors::Unimplemented("Output type ", DataTypeString(output_dtype),
5938                                  " is not supported, at ", node_def.name());
5939   }
5940   int tf_axis = inputs.at(1).weights().GetSpan<int>()[0];
5941   int trt_axis;
5942   nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
5943   TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(),
5944                                  params->use_implicit_batch, &trt_axis));
5945   nvinfer1::TopKOperation topk_op;
5946   if (node_def.op() == "ArgMin") {
5947     topk_op = nvinfer1::TopKOperation::kMIN;
5948   } else if (node_def.op() == "ArgMax") {
5949     topk_op = nvinfer1::TopKOperation::kMAX;
5950   } else {
5951     return errors::InvalidArgument("Unsupported ArgMin/Max operation");
5952   }
5953 
5954 #if !IS_TRT_VERSION_GE(7, 0, 0, 11)
5955   const nvinfer1::Dims trt_dims = params->inputs.at(0).GetTrtDims();
5956   if (trt_dims.nbDims >= 4) {
5957     string trt_dim_str = DebugString(trt_dims);
5958 
5959     return errors::Unimplemented(node_def.op(), " op is not able to support",
5960                                  " tensors with 4+ dimensions (excluding batch",
5961                                  " size). Received: ", trt_dim_str);
5962   }
5963 #endif
5964 
5965   if (params->validation_only) return Status::OK();
5966 
5967   // Use TopK with k = 1. Only indices output is needed (output 1).
5968   const uint32_t reduce_axes = 1 << trt_axis;
5969   nvinfer1::ITopKLayer* layer = params->converter->network()->addTopK(
5970       *inputs.at(0).tensor()->trt_tensor(), topk_op, 1, reduce_axes);
5971   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
5972   params->converter->SetLayerName(layer, node_def, "topk");
5973   ITensorProxyPtr output_indices_tensor = layer->getOutput(1);
5974 
5975   // Squeeze on axis.
5976   std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
5977   input_dims[trt_axis] = 0;
5978   ITensorProxyPtr output_tensor = nullptr;
5979   TF_RETURN_IF_ERROR(params->converter->SqueezeTensor(
5980       output_indices_tensor, &input_dims, params, &output_tensor));
5981   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
5982 
5983   return Status::OK();
5984 }
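// Illustrative sketch (not part of the converter): SqueezeTensor drops the
// axes marked with 0, so for ArgMax over axis 1 of a [8, 10, 6] tensor the
// marked dims {8, 0, 6} squeeze to the expected [8, 6] output.
static std::vector<int> MarkAxisForSqueeze(const nvinfer1::Dims& dims,
                                           int axis) {
  std::vector<int> d(dims.d, dims.d + dims.nbDims);
  d[axis] = 0;  // 0 marks the dimension to be removed
  return d;
}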
5985 
5986 Status ConvertTopK(OpConverterParams* params) {
5987   const auto& inputs = params->inputs;
5988   const auto& node_def = params->node_def;
5989   TF_RETURN_IF_ERROR(
5990       CheckInputsWeights(*params, {{"input", false}, {"k", true}}));
5991   TF_RETURN_IF_ERROR(
5992       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
5993   TFAttrs attrs(node_def);
5994   const bool sorted = attrs.get<bool>("sorted");
5995   if (!sorted) {
5996     // TensorRT only supports sorted output. Although the TensorFlow API
5997     // doesn't specify the order of output elements when sorted=false,
5998     // it's safer not to convert: TensorRT's output might differ from
5999     // TensorFlow's, which could cause confusion.
6000     return errors::InvalidArgument("Only sorted=True is supported, at ",
6001                                    node_def.name());
6002   }
6003 
6004   ITensorProxyPtr tensor = inputs.at(0).tensor();
6005   const int num_dims = tensor->getDimensions().nbDims;
6006   if (num_dims == 0) {
6007     return errors::InvalidArgument(
6008         "TensorRT TopK cannot apply on batch dimension, at", node_def.name());
6009   }
6010 
6011   TRT_ShapedWeights k_w = inputs.at(1).weights();
6012   if (k_w.count() != 1) {
6013     return errors::InvalidArgument("k value of TopK should be a scalar, at ",
6014                                    node_def.name());
6015   }
6016   // Note that ITopKLayer always has sorted outputs, so we don't need to
6017   // handle the 'sorted' attribute of the node.
6018   if (params->validation_only) return Status::OK();
6019 
6020   const nvinfer1::TopKOperation op = nvinfer1::TopKOperation::kMAX;
6021   const int k = *(static_cast<int*>(k_w.GetValues()));
6022   const uint32_t reduce_axes = 1 << (num_dims - 1);
6023   nvinfer1::ITopKLayer* layer = params->converter->network()->addTopK(
6024       *tensor->trt_tensor(), op, k, reduce_axes);
6025   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
6026   params->converter->SetLayerName(layer, node_def);
6027 
6028   ITensorProxyPtr output_value_tensor = layer->getOutput(0);
6029   ITensorProxyPtr output_indices_tensor = layer->getOutput(1);
6030   params->outputs->push_back(TRT_TensorOrWeights(output_value_tensor));
6031   params->outputs->push_back(TRT_TensorOrWeights(output_indices_tensor));
6032   return Status::OK();
6033 }
6034 
6035 StatusOr<std::pair<ITensorProxyPtr, ITensorProxyPtr>>
6036 CalcDepthSpaceDynamicShape(OpConverterParams* params, int block_size,
6037                            string data_format) {
6038   // The input shape is not fully known at conversion time, so instead we use
6039   // a shape layer and shape arithmetic to calculate the reshape dimensions.
6040   const auto& inputs = params->inputs;
6041   const auto& node_def = params->node_def;
6042 
6043   const int channels_axis = data_format == "NCHW" ? 1 : 3;
6044   const int h_axis = data_format == "NCHW" ? 2 : 1;
6045   const int w_axis = data_format == "NCHW" ? 3 : 2;
6046 
6047   // Get shapes.
6048   ITensorProxyPtr shape = params->converter->network()
6049                               ->addShape(*inputs.at(0).tensor()->trt_tensor())
6050                               ->getOutput(0);
6051   ITensorProxyPtr batch_size =
6052       params->converter->network()
6053           ->addSlice(*shape->trt_tensor(), {1, {0}}, {1, {1}}, {1, {1}})
6054           ->getOutput(0);
6055   ITensorProxyPtr num_channels =
6056       params->converter->network()
6057           ->addSlice(*shape->trt_tensor(), {1, {channels_axis}}, {1, {1}},
6058                      {1, {1}})
6059           ->getOutput(0);
6060   ITensorProxyPtr h =
6061       params->converter->network()
6062           ->addSlice(*shape->trt_tensor(), {1, {h_axis}}, {1, {1}}, {1, {1}})
6063           ->getOutput(0);
6064   ITensorProxyPtr w =
6065       params->converter->network()
6066           ->addSlice(*shape->trt_tensor(), {1, {w_axis}}, {1, {1}}, {1, {1}})
6067           ->getOutput(0);
6068   ITensorProxyPtr r;
6069   TF_RETURN_IF_ERROR(CreateScalarConstant(params, block_size, &r));
6070   ITensorProxyPtr r_squared;
6071   TF_RETURN_IF_ERROR(
6072       CreateScalarConstant(params, block_size * block_size, &r_squared));
6073   // Get shuffle parameters.
6074   std::vector<ITensorProxyPtr> first_shuffle_tensors(6, nullptr);
6075   std::vector<ITensorProxyPtr> second_shuffle_tensors(4, nullptr);
6076   if (node_def.op() == "DepthToSpace") {
6077     // First Reshape [N, C, H, W] - > [N, r, r, C/(r*r), H, W].
6078     first_shuffle_tensors[0] = batch_size;
6079     first_shuffle_tensors[1] = r;
6080     first_shuffle_tensors[2] = r;
6081     first_shuffle_tensors[3] =
6082         params->converter->network()
6083             ->addElementWise(*num_channels->trt_tensor(),
6084                              *r_squared->trt_tensor(),
6085                              nvinfer1::ElementWiseOperation::kDIV)
6086             ->getOutput(0);
6087     first_shuffle_tensors[4] = h;
6088     first_shuffle_tensors[5] = w;
6089     // Second Reshape [N, C/(r*r), H, r, W, r] -> [N, C/(r*r), H * r, W * r].
6090     second_shuffle_tensors[0] = batch_size;
6091     second_shuffle_tensors[1] =
6092         params->converter->network()
6093             ->addElementWise(*num_channels->trt_tensor(),
6094                              *r_squared->trt_tensor(),
6095                              nvinfer1::ElementWiseOperation::kDIV)
6096             ->getOutput(0);
6097     second_shuffle_tensors[2] =
6098         params->converter->network()
6099             ->addElementWise(*h->trt_tensor(), *r->trt_tensor(),
6100                              nvinfer1::ElementWiseOperation::kPROD)
6101             ->getOutput(0);
6102     second_shuffle_tensors[3] =
6103         params->converter->network()
6104             ->addElementWise(*w->trt_tensor(), *r->trt_tensor(),
6105                              nvinfer1::ElementWiseOperation::kPROD)
6106             ->getOutput(0);
6107   } else if (node_def.op() == "SpaceToDepth") {
6108     // First Reshape [N, C, H, W] -> [N, C, H/r, r, W/r, r].
6109     first_shuffle_tensors[0] = batch_size;
6110     first_shuffle_tensors[1] = num_channels;
6111     first_shuffle_tensors[2] =
6112         params->converter->network()
6113             ->addElementWise(*h->trt_tensor(), *r->trt_tensor(),
6114                              nvinfer1::ElementWiseOperation::kDIV)
6115             ->getOutput(0);
6116     first_shuffle_tensors[3] = r;
6117     first_shuffle_tensors[4] =
6118         params->converter->network()
6119             ->addElementWise(*w->trt_tensor(), *r->trt_tensor(),
6120                              nvinfer1::ElementWiseOperation::kDIV)
6121             ->getOutput(0);
6122     first_shuffle_tensors[5] = r;
6123 
6124     // Second Reshape  [N, r, r, C, H/r, W/r] -> [N, C*r*r, H/r, W/r].
6125     second_shuffle_tensors[0] = batch_size;
6126     second_shuffle_tensors[1] =
6127         params->converter->network()
6128             ->addElementWise(*num_channels->trt_tensor(),
6129                              *r_squared->trt_tensor(),
6130                              nvinfer1::ElementWiseOperation::kPROD)
6131             ->getOutput(0);
6132     second_shuffle_tensors[2] =
6133         params->converter->network()
6134             ->addElementWise(*h->trt_tensor(), *r->trt_tensor(),
6135                              nvinfer1::ElementWiseOperation::kDIV)
6136             ->getOutput(0);
6137     second_shuffle_tensors[3] =
6138         params->converter->network()
6139             ->addElementWise(*w->trt_tensor(), *r->trt_tensor(),
6140                              nvinfer1::ElementWiseOperation::kDIV)
6141             ->getOutput(0);
6142   }
6143 
6144   StatusOr<ITensorProxyPtr> result =
6145       ConcatenateTensors(params, first_shuffle_tensors, 0);
6146   TF_RETURN_IF_ERROR(result.status());
6147   ITensorProxyPtr first_shuffle_shape = result.ValueOrDie();
6148 
6149   result = ConcatenateTensors(params, second_shuffle_tensors, 1);
6150   TF_RETURN_IF_ERROR(result.status());
6151   ITensorProxyPtr second_shuffle_shape = result.ValueOrDie();
6152 
6153   return std::make_pair(first_shuffle_shape, second_shuffle_shape);
6154 }
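// Illustrative sketch (not part of the converter): host-side check of the
// shape arithmetic built above with TRT shape tensors, for DepthToSpace in
// NCHW with block size r. E.g. DepthToSpaceOutputShape(2, 12, 4, 5, 2)
// returns {2, 3, 8, 10}.
static std::vector<int> DepthToSpaceOutputShape(int n, int c, int h, int w,
                                                int r) {
  return {n, c / (r * r), h * r, w * r};
}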
6155 
6156 Status ConvertDepthSpaceShuffle(OpConverterParams* params) {
6157   const auto& inputs = params->inputs;
6158   const auto& node_def = params->node_def;
6159   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
6160   TF_RETURN_IF_ERROR(AllowDataTypes(
6161       *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
6162   TFAttrs attrs(node_def);
6163   const int block_size = attrs.get<int64>("block_size");
6164   if (block_size < 2) {
6165     return errors::InvalidArgument("Block size must be 2 or greater, at ",
6166                                    node_def.name());
6167   }
6168   const string data_format = attrs.get<string>("data_format");
6169   if (data_format != "NCHW" && data_format != "NHWC") {
6170     return errors::Unimplemented("Data format ", data_format,
6171                                  " is not supported, at ", node_def.name());
6172   }
6173   int idx_offset = params->use_implicit_batch ? 0 : 1;
6174   nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
6175   const int required_rank = 3 + idx_offset;
6176   if (dims.nbDims != required_rank) {
6177     return errors::InvalidArgument("The input to ", node_def.op(),
6178                                    " must be rank 4, at ", node_def.name());
6179   }
6180   const int num_channels =
6181       data_format == "NCHW" ? dims.d[0 + idx_offset] : dims.d[2 + idx_offset];
6182   const int h =
6183       data_format == "NCHW" ? dims.d[1 + idx_offset] : dims.d[0 + idx_offset];
6184   const int w =
6185       data_format == "NCHW" ? dims.d[2 + idx_offset] : dims.d[1 + idx_offset];
6186   // Get shuffle parameters.
6187   nvinfer1::Dims first_shuffle_shape;
6188   nvinfer1::Permutation transpose_perm;
6189   nvinfer1::Dims second_shuffle_shape;
6190 
6191   // We define all the shuffle and transpose dimensions assuming implicit batch
6192   // mode. Afterwards we will update them to explicit batch mode if needed.
6193   // Additionally, an NCHW layout is assumed, and this assumption is corrected
6194   // afterwards with an initial transpose op. TODO(tfeher): Get rid of the
6195   // layout_transpose ops by defining shuffle shape specifically for NCHW and
6196 // NHWC.
6197   if (node_def.op() == "DepthToSpace") {
6198     if (num_channels != -1 && num_channels % (block_size * block_size) != 0) {
6199       return errors::InvalidArgument(
6200           "Number of channels must be divisible by block_size*block_size, at ",
6201           node_def.name());
6202     }
6203     // First Reshape [C, H, W] - > [r, r, C/(r*r), H, W]
6204     first_shuffle_shape = {
6205         /*nbDims=*/5,
6206         /*d=*/{block_size, block_size, num_channels / (block_size * block_size),
6207                h, w}};
6208     // Transpose [r, r, C/(r*r), H, W] -> [C/(r*r), H, r, W, r]
6209     transpose_perm = {2, 3, 0, 4, 1};
6210     // Second Reshape [C/(r*r), H, r, W, r] -> [C/(r*r), H * r, W * r]
6211     second_shuffle_shape =
6212         nvinfer1::Dims3(num_channels / (block_size * block_size),
6213                         h * block_size, w * block_size);
6214   } else {
6215     if (node_def.op() != "SpaceToDepth")
6216       return errors::InvalidArgument("Incorrect op type ", node_def.op());
6217     if ((h != -1 && h % block_size != 0) || (w != -1 && w % block_size != 0)) {
6218       return errors::InvalidArgument(
6219           "Width and height must be divisible by block_size, at ",
6220           node_def.name());
6221     }
6222     // First Reshape [C, H, W] -> [C, H/r, r, W/r, r]
6223     first_shuffle_shape = {/*nbDims=*/5,
6224                            /*d=*/{num_channels, h / block_size, block_size,
6225                                   w / block_size, block_size}};
6226     // Transpose [C, H/r, r, W/r, r] -> [r, r, C, H/r, W/r]
6227     transpose_perm = {2, 4, 0, 1, 3};
6228     // Second Reshape  [r, r, C, H/r, W/r] -> [C*r*r, H/r, W/r]
6229     second_shuffle_shape = nvinfer1::Dims3(
6230         num_channels * block_size * block_size, h / block_size, w / block_size);
6231   }
6232   if (params->validation_only) return Status::OK();
6233 
6234   nvinfer1::IShuffleLayer* first_shuffle =
6235       params->converter->network()->addShuffle(
6236           *inputs.at(0).tensor()->trt_tensor());
6237   TFTRT_RETURN_ERROR_IF_NULLPTR(first_shuffle, node_def.name());
6238   params->converter->SetLayerName(first_shuffle, node_def, "shuffle",
6239                                   /*op_instance=*/0);
6240 
6241   ITensorProxyPtr second_shuffle_shape_tensor;
6242 
6243   if (HasStaticShape(inputs.at(0).GetTrtDims())) {
6244     // Adjust a reshape constructed at implicit batch mode for explicit batch
6245     // mode. In particular, we need to insert the batch dimension size to the
6246     // beginning of all the dimension sizes. Example: reshape {20,10,30} for
6247     // implicit batch mode becomes reshape {N,20,10,30} for explicit batch mode.
6248     auto adjust_reshape = [](int N, nvinfer1::Dims dims,
6249                              bool use_implicit_batch) {
6250       if (use_implicit_batch) return dims;
6251       for (int i = dims.nbDims; i > 0; i--) {
6252         dims.d[i] = dims.d[i - 1];
6253       }
6254       dims.d[0] = N;
6255       dims.nbDims++;
6256       return dims;
6257     };
6258 
6259     first_shuffle_shape = adjust_reshape(dims.d[0], first_shuffle_shape,
6260                                          params->use_implicit_batch);
6261     second_shuffle_shape = adjust_reshape(dims.d[0], second_shuffle_shape,
6262                                           params->use_implicit_batch);
6263 
6264     first_shuffle->setReshapeDimensions(first_shuffle_shape);
6265   } else {
6266     StatusOr<std::pair<ITensorProxyPtr, ITensorProxyPtr>> result =
6267         CalcDepthSpaceDynamicShape(params, block_size, data_format);
6268     TF_RETURN_IF_ERROR(result.status());
6269     first_shuffle->setInput(1, *result.ValueOrDie().first->trt_tensor());
6270     second_shuffle_shape_tensor = result.ValueOrDie().second;
6271   }
6272 
6273   // Adjust a transpose constructed assuming implicit batch mode for explicit
6274   // batch mode. In particular, we need to add the batch dimension to d0 and
6275   // add 1 to all the dimension ids in the transpose. Example: permutation
6276   // {2,1,0} for implicit batch mode becomes permutation {0,3,2,1} for
6277   // explicit batch mode.
6278   auto adjust_perm = [](int n, nvinfer1::Permutation perm,
6279                         bool use_implicit_batch) {
6280     if (use_implicit_batch) return perm;
6281     for (int i = n; i > 0; i--) {
6282       perm.order[i] = perm.order[i - 1] + 1;
6283     }
6284     perm.order[0] = 0;
6285     return perm;
6286   };
6287   transpose_perm = adjust_perm(5, transpose_perm, params->use_implicit_batch);
6288 
6289   if (data_format == "NHWC") {
6290     nvinfer1::Permutation layout_transpose =
6291         adjust_perm(3, {2, 0, 1}, params->use_implicit_batch);
6292     first_shuffle->setFirstTranspose(layout_transpose);
6293   }
6294   first_shuffle->setSecondTranspose(transpose_perm);
6295 
6296   nvinfer1::IShuffleLayer* second_shuffle =
6297       params->converter->network()->addShuffle(*first_shuffle->getOutput(0));
6298   TFTRT_RETURN_ERROR_IF_NULLPTR(second_shuffle, node_def.name());
6299   params->converter->SetLayerName(second_shuffle, node_def, "shuffle",
6300                                   /*op_instance=*/1);
6301 
6302   if (HasStaticShape(inputs.at(0).GetTrtDims())) {
6303     second_shuffle->setReshapeDimensions(second_shuffle_shape);
6304   } else {
6305     second_shuffle->setInput(1, *second_shuffle_shape_tensor->trt_tensor());
6306   }
6307   if (data_format == "NHWC") {
6308     nvinfer1::Permutation layout_transpose =
6309         adjust_perm(3, {1, 2, 0}, params->use_implicit_batch);
6310     second_shuffle->setSecondTranspose(layout_transpose);
6311   }
6312 
6313   params->outputs->push_back(TRT_TensorOrWeights(second_shuffle->getOutput(0)));
6314   return Status::OK();
6315 }
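// Illustrative sketch (not part of the converter): adjust_perm above shifts
// every index up by one and prepends the untouched batch dimension, e.g.
// {2, 0, 1} over n=3 dims becomes {0, 3, 1, 2} in explicit batch mode.
static nvinfer1::Permutation AdjustPermForExplicitBatch(
    int n, nvinfer1::Permutation perm) {
  for (int i = n; i > 0; --i) {
    perm.order[i] = perm.order[i - 1] + 1;
  }
  perm.order[0] = 0;
  return perm;
}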
6316 
6317 Status ConvertSquaredDifference(OpConverterParams* params) {
6318   TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}, {"y", false}}));
6319   TF_RETURN_IF_ERROR(
6320       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
6321   const auto& inputs = params->inputs;
6322   const auto& node_def = params->node_def;
6323   // Broadcast inputs.
6324   nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r;
6325   TF_RETURN_IF_ERROR(GetTrtBroadcastShape(
6326       inputs.at(0), inputs.at(1), /*check_feasibility=*/true,
6327       params->use_implicit_batch, &broadcasted_dims_l, &broadcasted_dims_r));
6328   ITensorProxyPtr tensor_l = nullptr;
6329   ITensorProxyPtr tensor_r = nullptr;
6330   TF_RETURN_IF_ERROR(
6331       PrepareTensorForShape(params->converter, inputs.at(0), broadcasted_dims_l,
6332                             params->validation_only, &tensor_l, node_def));
6333   TF_RETURN_IF_ERROR(
6334       PrepareTensorForShape(params->converter, inputs.at(1), broadcasted_dims_r,
6335                             params->validation_only, &tensor_r, node_def));
6336   if (params->validation_only) return Status::OK();
6337 
6338   // Subtract x - y.
6339   nvinfer1::IElementWiseLayer* sub =
6340       params->converter->network()->addElementWise(
6341           *tensor_l->trt_tensor(), *tensor_r->trt_tensor(),
6342           nvinfer1::ElementWiseOperation::kSUB);
6343   TFTRT_RETURN_ERROR_IF_NULLPTR(sub, node_def.name());
6344   params->converter->SetLayerName(sub, node_def, "sub");
6345 
6346   // Multiply (x - y) * (x - y).
6347   nvinfer1::IElementWiseLayer* mul =
6348       params->converter->network()->addElementWise(
6349           *sub->getOutput(0), *sub->getOutput(0),
6350           nvinfer1::ElementWiseOperation::kPROD);
6351   TFTRT_RETURN_ERROR_IF_NULLPTR(mul, node_def.name());
6352   params->converter->SetLayerName(mul, node_def, "mul");
6353 
6354   params->outputs->push_back(TRT_TensorOrWeights(mul->getOutput(0)));
6355   return Status::OK();
6356 }
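// Illustrative sketch (not part of the converter): the two layers above
// compute the elementwise reference
//   squared_difference(x, y) == (x - y) * (x - y).
static float SquaredDifferenceRef(float x, float y) {
  const float d = x - y;
  return d * d;
}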
6357 
6358 #if IS_TRT_VERSION_GE(7, 1, 3, 0)
6359 
6360 bool AllowNmsTopkOverride() {
6361   static bool result = [] {
6362     bool value;
6363     Status status = ReadBoolFromEnvVar("TF_TRT_ALLOW_NMS_TOPK_OVERRIDE",
6364                                        /*default_value=*/false, &value);
6365     if (!status.ok()) {
6366       LOG(ERROR) << status;
6367     }
6368     return value;
6369   }();
6370   return result;
6371 }
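// Illustrative sketch (not part of the converter): the override is opted
// into through the environment, e.g. before building the engine:
//
//   setenv("TF_TRT_ALLOW_NMS_TOPK_OVERRIDE", "1", /*overwrite=*/1);
//
// Note the static local above caches the first lookup for the process
// lifetime, so the variable must be set before the first conversion.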
6372 
6373 Status ConvertCombinedNMS(OpConverterParams* params) {
6374   TF_RETURN_IF_ERROR(
6375       CheckInputsWeights(*params, {{"boxes", false},
6376                                    {"scores", false},
6377                                    {"max_output_size_per_class", true},
6378                                    {"max_total_size", true},
6379                                    {"iou_threshold", true},
6380                                    {"score_threshold", true}}));
6381   const auto& inputs = params->inputs;
6382   const auto& node_def = params->node_def;
6383 
6384   ITensorProxyPtr boxes_tensor = inputs.at(0).tensor();
6385   ITensorProxyPtr scores_tensor = inputs.at(1).tensor();
6386   TRT_ShapedWeights output_size_per_class = inputs.at(2).weights();
6387   TRT_ShapedWeights total_size = inputs.at(3).weights();
6388   TRT_ShapedWeights iou_threshold = inputs.at(4).weights();
6389   TRT_ShapedWeights score_threshold = inputs.at(5).weights();
6390 
6391   // Validate tensors and weights (also set some of the needed plugin fields)
6392   const auto boxes_dims = boxes_tensor->getDimensions();
6393   const auto scores_dims = scores_tensor->getDimensions();
6394   if (!params->use_implicit_batch &&
6395       (!HasStaticShape(boxes_dims) || !HasStaticShape(scores_dims))) {
6396     return errors::Unimplemented(
6397         "TensorRT BatchedNMS Plugin requires input with static shape");
6398   }
6399   const int offset = params->use_implicit_batch ? 0 : 1;
6400   if (boxes_dims.nbDims != 3 + offset) {
6401     return errors::InvalidArgument(
6402         "TensorRT BatchedNMS Plugin input boxes must be 4-D including batch ",
6403         node_def.name());
6404   }
6405   const int class_idx = 1 + offset;
6406   const int num_classes = scores_dims.d[class_idx];
6407   const int num_boxes = boxes_dims.d[0 + offset];
6408   bool box_check =
6409       boxes_dims.d[class_idx] == 1 || boxes_dims.d[class_idx] == num_classes;
6410   if (!box_check) {
6411     return errors::InvalidArgument(
6412         "TensorRT BatchedNMS Plugin third dimension of boxes must be either 1 "
6413         "or num_classes ",
6414         node_def.name());
6415   }
6416 
6417   if (output_size_per_class.count() != 1) {
6418     return errors::InvalidArgument(
6419         "TensorRT BatchedNMS Plugin max_output_size_per_class must be scalar ",
6420         node_def.name());
6421   }
6422   int max_size_per_class =
6423       *(static_cast<int*>(output_size_per_class.GetValues()));
6424   if (max_size_per_class <= 0) {
6425     return errors::InvalidArgument(
6426         "TensorRT BatchedNMS Plugin max_output_size_per_class should be > 0",
6427         node_def.name());
6428   }
6429   if (total_size.count() != 1) {
6430     return errors::InvalidArgument(
6431         "TensorRT BatchedNMS Plugin max_total_size must be scalar ",
6432         node_def.name());
6433   }
6434   int max_total_size = *(static_cast<int*>(total_size.GetValues()));
6435   if (max_total_size <= 0) {
6436     return errors::InvalidArgument(
6437         "TensorRT BatchedNMS Plugin max_total_size should be > 0",
6438         node_def.name());
6439   }
6440   if (iou_threshold.count() != 1) {
6441     return errors::InvalidArgument(
6442         "TensorRT BatchedNMS Plugin iou_threshold must be scalar ",
6443         node_def.name());
6444   }
6445   float iou_thresh = *(static_cast<float*>(iou_threshold.GetValues()));
6446   if (iou_thresh < 0.0 || iou_thresh > 1.0) {
6447     return errors::InvalidArgument(
6448         "TensorRT BatchedNMS Plugin iou_threshold must be in [0, 1]",
6449         node_def.name());
6450   }
6451   if (score_threshold.count() != 1) {
6452     return errors::InvalidArgument(
6453         "TensorRT BatchedNMS Plugin score_threshold must be scalar ",
6454         node_def.name());
6455   }
6456 
6457   // TRT op is_normalized=False treats input coordinates as pixels and
6458   // calculates width/height as (max - min + 1).
6459   //
6460   // TF op CombinedNonMaxSuppression doesn't care about the normalization and
6461   // calculates width/height as (max - min).
6462   //
6463   // We set is_normalized = true to be consistent with the TF IOU calculation.
6464   const bool is_normalized = true;
6465 
6466   TFAttrs attrs(node_def);
6467   bool share_location = (boxes_dims.d[class_idx] == 1);
6468   const bool pad_per_class = attrs.get<bool>("pad_per_class");
6469   const bool clip_boxes = attrs.get<bool>("clip_boxes");
6470   int keep_top_k = 0;
6471   if (pad_per_class) {
6472     keep_top_k = std::min(max_size_per_class * num_classes, max_total_size);
6473   } else {
6474     keep_top_k = max_total_size;
6475   }
6476 
6477   // According to the batchedNMS plugin description we need to set top_k so
6478   // that keep_top_k <= top_k.
6479   // https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
6480   // Before the NMS step, TRT selects the top_k candidates from each class and
6481   // discards the rest. The NMS step is performed only among the top_k
6482   // candidates. To be strictly compatible with the TF op, top_k must be
6483   // greater than or equal to num_boxes.
6484   int top_k = std::max(num_boxes, keep_top_k);
6485   // TRT has a limitation: top_k <= 4096.
6486   if (top_k > 4096) {
6487     if (AllowNmsTopkOverride()) {
6488       top_k = 4096;
6489       keep_top_k = std::min(top_k, keep_top_k);
6490     } else {
6491       return errors::InvalidArgument(
6492           "TRT NMS plugin allows top_k<=4096, where top_k = max(num_boxes, "
6493           "max_total_size). You can override this by setting the "
6494           "TF_TRT_ALLOW_NMS_TOPK_OVERRIDE=1 environment variable, but this "
6495           "can result in a loss of accuracy.");
6496     }
6497   }
6498 
6499   if (params->validation_only) return Status::OK();
6500   float score_thresh = *(static_cast<float*>(score_threshold.GetValues()));
6501   const int background_id = -1;
6502   nvinfer1::PluginField fields[9] = {
6503       nvinfer1::PluginField{"shareLocation", &share_location,
6504                             nvinfer1::PluginFieldType::kINT32, 1},
6505       nvinfer1::PluginField{"backgroundLabelId", &background_id,
6506                             nvinfer1::PluginFieldType::kINT32, 1},
6507       nvinfer1::PluginField{"numClasses", &num_classes,
6508                             nvinfer1::PluginFieldType::kINT32, 1},
6509       nvinfer1::PluginField{"topK", &top_k, nvinfer1::PluginFieldType::kINT32,
6510                             1},
6511       nvinfer1::PluginField{"keepTopK", &keep_top_k,
6512                             nvinfer1::PluginFieldType::kINT32, 1},
6513       nvinfer1::PluginField{"scoreThreshold", &score_thresh,
6514                             nvinfer1::PluginFieldType::kFLOAT32, 1},
6515       nvinfer1::PluginField{"iouThreshold", &iou_thresh,
6516                             nvinfer1::PluginFieldType::kFLOAT32, 1},
6517       nvinfer1::PluginField{"isNormalized", &is_normalized,
6518                             nvinfer1::PluginFieldType::kINT32, 1},
6519       nvinfer1::PluginField{"clipBoxes", &clip_boxes,
6520                             nvinfer1::PluginFieldType::kINT32, 1}};
6521   nvinfer1::PluginFieldCollection fc{9, fields};
6522 
6523   // Get plugin creator
6524   auto creator =
6525       getPluginRegistry()->getPluginCreator("BatchedNMS_TRT", "1", "");
6526   TFTRT_RETURN_ERROR_IF_NULLPTR(creator, node_def.name());
6527 
6528   // Create plugin
6529   TrtUniquePtrType<nvinfer1::IPluginV2> plugin(
6530       creator->createPlugin(node_def.name().c_str(), &fc));
6531   TFTRT_RETURN_ERROR_IF_NULLPTR(plugin, node_def.name());
6532 
6533   // Set plugin inputs
6534   std::vector<nvinfer1::ITensor*> trt_plugin_inputs;
6535   trt_plugin_inputs.push_back(boxes_tensor->trt_tensor());
6536   trt_plugin_inputs.push_back(scores_tensor->trt_tensor());
6537 
6538   // Add plugin to network
6539   nvinfer1::IPluginV2Layer* layer = params->converter->network()->addPluginV2(
6540       &trt_plugin_inputs[0], static_cast<int>(trt_plugin_inputs.size()),
6541       *plugin);
6542   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
6543   params->converter->SetLayerName(layer, node_def, "plugin");
6544 
6545   // Set plugin outputs
6546   ITensorProxyPtr output_nmsed_boxes = layer->getOutput(1);
6547 
6548   // TensorRT fixes (removes) the extra last dimension in CombinedNMS outputs
6549   ITensorProxyPtr output_num_detections = layer->getOutput(0);
6550   ITensorProxyPtr output_nmsed_scores = layer->getOutput(2);
6551   ITensorProxyPtr output_nmsed_classes = layer->getOutput(3);
6552 
6553   params->outputs->push_back(TRT_TensorOrWeights(output_nmsed_boxes));
6554   params->outputs->push_back(TRT_TensorOrWeights(output_nmsed_scores));
6555   params->outputs->push_back(TRT_TensorOrWeights(output_nmsed_classes));
6556   params->outputs->push_back(TRT_TensorOrWeights(output_num_detections));
6557 
6558   return Status::OK();
6559 }
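// Illustrative sketch (not part of the converter): the keepTopK/topK
// selection above, as a pure function. With pad_per_class=true, 2 classes,
// max_size_per_class=100 and max_total_size=300, keep_top_k is 200.
static int ComputeKeepTopK(int max_size_per_class, int num_classes,
                           int max_total_size, bool pad_per_class) {
  return pad_per_class
             ? std::min(max_size_per_class * num_classes, max_total_size)
             : max_total_size;
}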
6560 #endif  // IS_TRT_VERSION_GE(7, 1, 3, 0)
6561 
6562 Status ConvertResize(OpConverterParams* params) {
6563   const auto& inputs = params->inputs;
6564   const auto& node_def = params->node_def;
6565   TF_RETURN_IF_ERROR(
6566       CheckInputsWeights(*params, {{"input", false}, {"size", true}}));
6567   TF_RETURN_IF_ERROR(AllowDataTypes(
6568       *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
6569 
6570   // Get input tensor. Transpose it from NHWC to NCHW.
6571   ITensorProxyPtr inputs_tensor = inputs.at(0).tensor();
6572 
6573   TFTRT_RETURN_ERROR_IF_NULLPTR(inputs_tensor, params->node_def.name());
6574 
6575   // Get output size. It must contain two values, i.e. [H_out, W_out].
6576   TRT_ShapedWeights weights = inputs.at(1).weights();
6577   if (weights.count() != 2) {
6578     return errors::Unimplemented("Resize to shape=[] is not supported, at ",
6579                                  node_def.name());
6580   }
6581   const int* weights_ptr = static_cast<int*>(weights.GetValues());
6582 
6583   // Verify and consume node attributes.
6584   TFAttrs attrs(node_def);
6585   bool align_corners = attrs.get<bool>("align_corners");
6586   TF_RETURN_IF_ERROR(
6587       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
6588 
6589   // Verify resize mode. Initialize resize mode if supported.
6590   nvinfer1::ResizeMode resize_mode;
6591   if (node_def.op() == "ResizeBilinear") {
6592 #if IS_TRT_VERSION_GE(7, 1, 0, 0)
6593     if (!align_corners) {
6594       return errors::InvalidArgument(
6595           "Cannot Convert Bilinear Resize when align_corners=False");
6596     }
6597 #endif
6598     resize_mode = nvinfer1::ResizeMode::kLINEAR;
6599   } else if (node_def.op() == "ResizeNearestNeighbor") {
6600     resize_mode = nvinfer1::ResizeMode::kNEAREST;
6601   } else {
6602     return errors::Unimplemented(node_def.op(), " is not yet implemented at ",
6603                                  node_def.name());
6604   }
6605 
6606   // Validate inputs_tensor.
6607   // TODO: Allow dynamic shape for input-1 when shape input tensors are handled.
6608   const auto inputs_dims = inputs_tensor->getDimensions();
6609   if (!params->use_implicit_batch && !HasStaticShape(inputs_dims)) {
6610     return errors::Unimplemented(
6611         "TensorRT IResizeLayer requires input with static shape");
6612   }
6613 
6614   // Return after validation if only validation is requested.
6615   if (params->validation_only) return Status::OK();
6616 
6617   // Transpose tensor from NHWC to NCHW format.
6618   TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
6619       inputs_tensor, {0, 3, 1, 2}, &inputs_tensor, node_def, "to_NCHW"));
6620 
6621   // Calculate output dimensions.
6622   // Given input dimensions [N, C, H, W] and output size [H_out, W_out],
6623   // output dimensions equals [N, C, H_out, W_out]
6624   nvinfer1::Dims output_dimensions;
6625   output_dimensions.nbDims = inputs_tensor->getDimensions().nbDims;
6626   for (int i = 0; i < output_dimensions.nbDims; ++i) {
6627     output_dimensions.d[i] = inputs_tensor->getDimensions().d[i];
6628   }
6629   output_dimensions.d[output_dimensions.nbDims - 2] = weights_ptr[0];
6630   output_dimensions.d[output_dimensions.nbDims - 1] = weights_ptr[1];
6631 
6632   // Add resize layer.
6633   nvinfer1::IResizeLayer* layer =
6634       params->converter->network()->addResize(*inputs_tensor->trt_tensor());
6635   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
6636   params->converter->SetLayerName(layer, node_def);
6637 
6638   // Set layer parameters.
6639   layer->setResizeMode(resize_mode);
6640   layer->setOutputDimensions(output_dimensions);
6641   layer->setAlignCorners(align_corners);
6642 
6643   // Get output tensor. Transpose it from NCHW to NHWC.
6644   ITensorProxyPtr output = layer->getOutput(0);
6645 
6646   TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
6647       output, {0, 2, 3, 1}, &output, node_def, "to_NHWC"));
6648   params->outputs->push_back(TRT_TensorOrWeights(output));
6649   // Success
6650   return Status::OK();
6651 }  // ConvertResize
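// Illustrative sketch (not part of the converter): the output-dimension
// computation above keeps N and C and replaces the two innermost (spatial)
// dims, so an NCHW input {1, 3, 32, 32} resized to [64, 64] yields
// {1, 3, 64, 64}.
static nvinfer1::Dims ResizeOutputDims(nvinfer1::Dims dims, int h_out,
                                       int w_out) {
  dims.d[dims.nbDims - 2] = h_out;
  dims.d[dims.nbDims - 1] = w_out;
  return dims;
}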
6652 
6653 Status ConvertAddN(OpConverterParams* params) {
6654   const auto& inputs = params->inputs;
6655   const auto& node_def = params->node_def;
6656   TF_RETURN_IF_ERROR(
6657       AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
6658   TFAttrs attrs(node_def);
6659   const int num_inputs = attrs.get<int64>("N");
6660   if (num_inputs < 2) {
6661     return errors::InvalidArgument("AddN requires at least two inputs, at ",
6662                                    node_def.name());
6663   }
6664   if (inputs.size() != num_inputs) {
6665     return errors::InvalidArgument("Got ", inputs.size(),
6666                                    " inputs but expected ", num_inputs, ", at ",
6667                                    node_def.name());
6668   }
6669   for (const auto& input : inputs) {
6670     if (!input.is_tensor() && input.weights().shape_.d[0] != 1) {
6671       return errors::InvalidArgument(
6672           "Weights input to AddN is required to have batch dimension 1.");
6673     }
6674   }
6675   if (params->validation_only) return Status::OK();
6676 
6677   // AddN doesn't support broadcast.
6678   std::vector<ITensorProxyPtr> tensor_inputs;
6679   for (const auto& input : inputs) {
6680     if (input.is_tensor()) {
6681       tensor_inputs.push_back(input.tensor());
6682     } else {
6683       auto dims = input.weights().shape_;
6684       TF_RETURN_IF_ERROR(RemoveBatchDimension(&dims));
6685       tensor_inputs.push_back(
6686           params->converter->CreateConstantLayer(input.weights(), dims));
6687     }
6688   }
6689   ITensorProxyPtr lhs = tensor_inputs[0];
6690   for (int i = 1; i < num_inputs; ++i) {
6691     ITensorProxyPtr rhs = tensor_inputs[i];
6692     nvinfer1::ILayer* layer = params->converter->network()->addElementWise(
6693         *lhs->trt_tensor(), *rhs->trt_tensor(),
6694         nvinfer1::ElementWiseOperation::kSUM);
6695     TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
6696     params->converter->SetLayerName(layer, node_def, std::to_string(i));
6697     lhs = layer->getOutput(0);
6698   }
6699   params->outputs->push_back(TRT_TensorOrWeights(lhs));
6700   return Status::OK();
6701 }
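// Illustrative sketch (not part of the converter): the loop above lowers
// AddN to a left fold of binary kSUM layers, ((t0 + t1) + t2) + ..., the
// elementwise analogue of:
static float AddNRef(const std::vector<float>& xs) {
  float acc = xs.empty() ? 0.0f : xs[0];
  for (size_t i = 1; i < xs.size(); ++i) acc += xs[i];
  return acc;
}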
6702 
6703 static void RegisterValidatableOpConverters(
6704     std::unordered_map<string, OpConverter>* registration) {
6705   (*registration)["BiasAdd"] = ConvertBiasAdd;
6706   (*registration)["ClipByValue"] = ConvertClipByValue;
6707 #if IS_TRT_VERSION_GE(7, 1, 3, 0)
6708   (*registration)["CombinedNonMaxSuppression"] = ConvertCombinedNMS;
6709 #endif
6710   (*registration)["AddN"] = ConvertAddN;
6711   (*registration)["Cast"] = ConvertCast;
6712   (*registration)["ConcatV2"] = ConvertConcat;
6713   (*registration)["Const"] = ConvertConst;
6714   (*registration)["Conv2D"] = ConvertConv2D;
6715   (*registration)["Conv2DBackpropInput"] = ConvertConv2DBackpropInput;
6716   (*registration)["DepthToSpace"] = ConvertDepthSpaceShuffle;
6717   (*registration)["DepthwiseConv2dNative"] = ConvertConv2DDepthwise;
6718 #if IS_TRT_VERSION_GE(7, 1, 3, 0)
6719   (*registration)["Einsum"] = ConvertEinsum;
6720 #endif
6721   (*registration)["ExpandDims"] = ConvertExpandDims;
6722   (*registration)["FusedConv2DBiasActivation"] =
6723       ConvertFusedConv2DBiasActivation;
6724   (*registration)["GatherV2"] = ConvertGather;
6725   (*registration)["LeakyRelu"] = ConvertLeakyRelu;
6726   (*registration)["MatMul"] = ConvertMatMul;
6727   (*registration)["Pack"] = ConvertPack;
6728   (*registration)["Pad"] = ConvertPad;
6729   (*registration)["Relu6"] = ConvertRelu6;
6730   (*registration)["Reshape"] = ConvertReshape;
6731   (*registration)["Conv3D"] = ConvertConv3D;
6732   (*registration)["Conv3DBackpropInputV2"] = ConvertConv3DBackpropInputV2;
6733   for (auto resize_mode : {"ResizeBilinear", "ResizeNearestNeighbor"}) {
6734     (*registration)[resize_mode] = ConvertResize;
6735   }
6736   for (auto pool_op_type : {"AvgPool3D", "MaxPool3D"}) {
6737     (*registration)[pool_op_type] = ConvertPool3D;
6738   }
6739   (*registration)["Shape"] = ConvertShape;
6740   (*registration)["Rsqrt"] = ConvertRsqrt;
6741   (*registration)["Slice"] = ConvertSlice;
6742   (*registration)["Softmax"] = ConvertSoftmax;
6743   (*registration)["SpaceToDepth"] = ConvertDepthSpaceShuffle;
6744   (*registration)["Split"] = ConvertSplit;
6745   (*registration)["Square"] = ConvertSquare;
6746   (*registration)["SquaredDifference"] = ConvertSquaredDifference;
6747   (*registration)["Squeeze"] = ConvertSqueeze;
6748   (*registration)["StridedSlice"] = ConvertStridedSlice;
6749   (*registration)["TopKV2"] = ConvertTopK;
6750   (*registration)["Transpose"] = ConvertTranspose;
6751   (*registration)["Unpack"] = ConvertUnpack;
6752   (*registration)["_CopyFromHostToGpu"] = ConvertIdentity;
6753   for (auto quantization_op_type : *TrtNodeValidator::quantize_ops) {
6754     (*registration)[quantization_op_type] = ConvertQuantize;
6755   }
6756   for (const auto& binary_op_pair : *BinaryOperationMap()) {
6757     (*registration)[binary_op_pair.first] = ConvertBinary;
6758   }
6759   for (const auto& activation_op_pair : *ActivationTypeMap()) {
6760     (*registration)[activation_op_pair.first] = ConvertActivation;
6761   }
6762   for (auto pool_op_type : {"AvgPool", "MaxPool"}) {
6763     (*registration)[pool_op_type] = ConvertPool;
6764   }
6765   for (auto normalization_op_type :
6766        {"FusedBatchNorm", "FusedBatchNormV2", "FusedBatchNormV3"}) {
6767     (*registration)[normalization_op_type] = ConvertFusedBatchNorm;
6768   }
6769   for (const auto& unary_op_pair : *UnaryOperationMap()) {
6770     (*registration)[unary_op_pair.first] = ConvertUnary;
6771   }
6772   for (auto reduce_op_type : {"Sum", "Prod", "Max", "Min", "Mean"}) {
6773     (*registration)[reduce_op_type] = ConvertReduce;
6774   }
6775   for (auto arg_minmax_type : {"ArgMin", "ArgMax"}) {
6776     (*registration)[arg_minmax_type] = ConvertArgMinMax;
6777   }
6778   // The following are no-ops during inference and will not be mapped to any TRT
6779   // layer.
6780   for (auto identity_op_type : {"Identity", "Snapshot", "StopGradient"}) {
6781     (*registration)[identity_op_type] = ConvertIdentity;
6782   }
6783   for (auto batch_matmul_type : {"BatchMatMul", "BatchMatMulV2"}) {
6784     (*registration)[batch_matmul_type] = ConvertBatchMatMul;
6785   }
6786 }
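// Illustrative sketch (not part of the converter): supporting a new op means
// adding one more entry to the same map, e.g. for a hypothetical ConvertFoo:
//
//   (*registration)["Foo"] = ConvertFoo;
//
// Both the validator and the converter then pick it up, since the two
// registries below are populated from this single function.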
6787 
6788 void TrtNodeValidator::RegisterOpValidators() {
6789   RegisterValidatableOpConverters(&op_validators_);
6790 }
6791 
6792 void Converter::RegisterOpConverters() {
6793   RegisterValidatableOpConverters(&op_registry_);
6794 }
6795 
6796 Status ConvertGraphDefToEngine(
6797     const GraphDef& gdef, TrtPrecisionMode precision_mode, int max_batch_size,
6798     size_t max_workspace_size_bytes,
6799     const std::vector<PartialTensorShape>& input_shapes,
6800     nvinfer1::ILogger* trt_logger, nvinfer1::IGpuAllocator* allocator,
6801     TRTInt8Calibrator* calibrator,
6802     TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, bool use_calibration,
6803     const bool use_implicit_batch, bool* convert_successfully,
6804     TrtShapeOptimizationProfile* profiles, absl::string_view engine_name) {
6805   engine->reset();
6806   if (convert_successfully) *convert_successfully = false;
6807 
6808   // Creating converter, TensorRT builder and network
6809   auto statusor = Converter::Create(precision_mode, use_calibration, trt_logger,
6810                                     use_implicit_batch, engine_name);
6811   TF_RETURN_IF_ERROR(statusor.status());
6812   auto converter = std::move(statusor.ValueOrDie());
6813 
6814   VLOG(1) << "Starting to convert TensorFlow ops to TensorRT layers";
6815   std::vector<Converter::EngineOutputInfo> output_tensors;
6816   int num_layers = converter->network()->getNbLayers();
6817   absl::flat_hash_set<const char*> layer_names;
6818   // Graph nodes are already topologically sorted during construction
6819   for (const auto& node_def : gdef.node()) {
6820     const string& node_name = node_def.name();
6821     VLOG(2) << "Converting node " << node_name << ", op=" << node_def.op();
6822     if (IsEngineInput(node_name)) {
6823       int32 slot_number = -1;
6824       string type_key;
6825       if (node_def.op() == "Placeholder") {
6826         if (!strings::safe_strto32(  // non-absl ok
6827                 node_name.c_str() + strlen(IONamePrefixes::kInputPHName),
6828                 &slot_number)) {
6829           return errors::InvalidArgument("Failed to parse slot number from ",
6830                                          node_name);
6831         }
6832         type_key = "dtype";
6833       } else if (tensorflow::grappler::IsArg(node_def)) {
6834         // Maybe remove the dependence on grappler and re-implement IsArg,
6835         // which is pretty simple (but could change if new Arg nodes are added)
6836         slot_number = node_def.attr().at("index").i();
6837         type_key = "T";
6838       } else {
6839         return errors::InvalidArgument(
6840             "Node ", node_name,
6841             " with is neither Placeholder nor Arg, instead ", node_def.op());
6842       }
6843       nvinfer1::DataType trt_dtype;
6844       nvinfer1::Dims trt_dims;
6845       int batch_size = -1;
6846       auto shape = input_shapes.at(slot_number);
6847       auto status = ValidateTensorProperties(
6848           node_def.op(), node_def.attr().at(type_key).type(), shape,
6849           use_implicit_batch, /*validation_only=*/false, &trt_dtype, &trt_dims,
6850           &batch_size);
6851       if (!status.ok()) {
6852         const string error_message =
6853             StrCat("Validation failed for ", node_name, " and input slot ",
6854                    slot_number, ": ", status.error_message());
6855         LOG_WARNING_WITH_PREFIX << error_message;
6856         return Status(status.code(), error_message);
6857       }
6858       VLOG(2) << "Adding engine input tensor " << node_name << " with shape "
6859               << DebugString(trt_dims);
6860       // TODO(laigd): the conversion should always happen at runtime where all
6861       // the shapes are known, and we can provide a mode to generate the
6862       // engines offline, by calling sess.run() and cache/serialize the engines.
6863       TF_RETURN_IF_ERROR(converter->AddInputTensor(node_name, trt_dtype,
6864                                                    trt_dims, batch_size));
6865     } else if (IsEngineOutput(node_name)) {
6866       int32 slot_number = -1;
6867       if (node_def.op() == "Identity") {
6868         if (!strings::safe_strto32(  // non-absl ok
6869                 node_name.c_str() + strlen(IONamePrefixes::kOutputPHName),
6870                 &slot_number)) {
6871           return errors::InvalidArgument("Failed to parse slot number from ",
6872                                          node_name);
6873         }
6874       } else if (tensorflow::grappler::IsRetval(node_def)) {
6875         slot_number = node_def.attr().at("index").i();
6876       } else {
6877         return errors::InvalidArgument(
6878             "Node with name ", node_name,
6879             " starting with IONamePrefixes::kOutputPHName is "
6880             "neither Identity nor Retval, instead ",
6881             node_def.op());
6882       }
6883       // Get output type that TensorFlow expects
6884       TFAttrs attrs(node_def);
6885       DataType tf_dtype = attrs.get<DataType>("T");
6886       nvinfer1::DataType trt_dtype;
6887       TF_RETURN_IF_ERROR(TfTypeToTrtType(tf_dtype, &trt_dtype));
6888       if (output_tensors.size() <= slot_number) {
6889         output_tensors.resize(slot_number + 1);
6890       }
6891       output_tensors.at(slot_number) = {node_def.input(0), node_name,
6892                                         trt_dtype};
6893     } else {
6894       TF_RETURN_IF_ERROR(converter->ConvertNode(node_def));
6895     }
6896 
6897     // To support TF-TRT profiling, we ensure each ILayer has a non-empty name.
6898     // BuildCudaEngine returns an error if there is any ILayer name collision.
6899     // We want to report the error here before BuildCudaEngine in a more
6900     // meaningful way.
6901     int new_num_layers = converter->network()->getNbLayers();
6902     for (int i = num_layers; i < new_num_layers; i++) {
6903       auto layer = converter->network()->getLayer(i);
6904       if (layer->getName() == nullptr ||
6905           !layer_names.insert(layer->getName()).second) {
6906         std::string error_message =
6907             absl::StrCat("Converting node ", node_name, ", op=", node_def.op(),
6908                          layer->getName() ? "create a layer with name collision"
6909                                           : "create a layer without a name");
6910         LOG_WARNING_WITH_PREFIX << error_message;
6911         return errors::Internal(error_message);
6912       }
6913     }
6914     num_layers = new_num_layers;
6915   }
6916   TF_RETURN_IF_ERROR(converter->RenameAndMarkOutputTensors(output_tensors));
6917   if (convert_successfully) *convert_successfully = true;
6918 
6919   // Apply user provided quantization ranges to tensors
6920   converter->MaybeApplyQuantizationRanges();
6921 
6922   // Build the engine.
6923   TF_RETURN_IF_ERROR(converter->BuildCudaEngine(
6924       engine, max_batch_size, max_workspace_size_bytes, allocator, calibrator,
6925       profiles));
6926 
6927   VLOG(1) << "Finished conversion";
6928   return Status::OK();
6929 }
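// Illustrative sketch (not part of the converter, and assumes
// "absl/strings/numbers.h" for absl::SimpleAtoi): the engine I/O slot
// parsing above expects node names of the form "<prefix><slot>"; given the
// prefix it reduces to:
static int ParseSlotNumber(const std::string& node_name,
                           const std::string& prefix) {
  int slot = -1;
  if (node_name.rfind(prefix, 0) != 0 ||  // name must start with the prefix
      !absl::SimpleAtoi(node_name.substr(prefix.size()), &slot)) {
    return -1;
  }
  return slot;
}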
6930 
6931 Status ConvertSegmentToGraphDef(
6932     const Graph* graph, const grappler::GraphProperties& graph_properties,
6933     const std::vector<const Node*>& subgraph_nodes,  // In topological order
6934     EngineInfo* engine_info) {
6935   std::vector<EngineConnection>* connections = &engine_info->connections;
6936   GraphDef* segment_def = &engine_info->segment_graph_def;
6937   bool has_int32_input = false;
6938   std::set<string> marker_nodes;
6939   // Update connection shapes/data types and add corresponding input/output
6940   // nodes in the segment graphdef.
6941   for (size_t i = 0; i < connections->size(); ++i) {
6942     auto& connection = connections->at(i);
6943     if (connection.is_control_edge()) continue;
6944     auto outside_node = graph->FindNodeId(connection.outside_id);
6945     if (!outside_node) {
6946       // This should never happen, unless the original graph is problematic.
6947       return errors::NotFound("Cannot find node with id ",
6948                               connection.outside_id, " in the graph.");
6949     }
6950     // Updates the shape and data types of input/output connections.
6951     DataType dtype;
6952     PartialTensorShape partial_shape;
6953     if (connection.is_input_edge) {
6954       GetOutputProperties(graph_properties,
6955                           graph->FindNodeId(connection.outside_id),
6956                           connection.outside_port, &partial_shape, &dtype);
6957       connection.outside_shape = partial_shape;
6958     } else {
6959       GetInputProperties(graph_properties,
6960                          graph->FindNodeId(connection.outside_id),
6961                          connection.outside_port, &partial_shape, &dtype);
6962       connection.inside_shape = partial_shape;
6963     }
6964     connection.connection_type = dtype;
6965 
6966     // Add dummy input/output nodes to the segment graphdef.
6967     if (connection.is_input_edge) {
6968       if (dtype == DT_INT32 && !has_int32_input) {
6969         has_int32_input = true;
6970       }
6971 
      const string node_name =
          StrCat(IONamePrefixes::kInputPHName, connection.port_number);
      if (marker_nodes.count(node_name)) {
        VLOG(1) << "Reusing input " << node_name << " for the edge "
                << connection.outside_node_name << ":"
                << connection.outside_port << " -> "
                << connection.inside_node_name << ":" << connection.inside_port;
        continue;
      }
      marker_nodes.insert(node_name);
      auto seg_node = segment_def->add_node();
      NodeDefBuilder builder(node_name, "_Arg");
      // Fail fast if the placeholder NodeDef cannot be finalized.
      TF_RETURN_IF_ERROR(builder.Attr("shape", partial_shape)
                             .Attr("T", dtype)
                             .Attr("index", connection.port_number)
                             .Finalize(seg_node));
      VLOG(1) << "Constructing input " << node_name << " for the edge "
              << connection.outside_node_name << ":" << connection.outside_port
              << " -> " << connection.inside_node_name << ":"
              << connection.inside_port;
    } else {
      const string node_name =
          StrCat(IONamePrefixes::kOutputPHName, connection.port_number);
      if (marker_nodes.count(node_name)) {
        VLOG(1) << "Reusing output " << node_name << " for the edge "
                << connection.inside_node_name << ":" << connection.inside_port
                << " -> " << connection.outside_node_name << ":"
                << connection.outside_port;
        continue;
      }
      marker_nodes.insert(node_name);
      auto seg_node = segment_def->add_node();
      NodeDefBuilder builder(node_name, "_Retval");
      TF_RETURN_IF_ERROR(
          builder.Attr("T", dtype)
              .Attr("index", connection.port_number)
              .Input(connection.inside_node_name, connection.inside_port, dtype)
              .Finalize(seg_node));
      VLOG(1) << "Constructing output " << node_name << " for the edge "
              << connection.inside_node_name << ":" << connection.inside_port
              << " -> " << connection.outside_node_name << ":"
              << connection.outside_port;
    }
  }  // for each connection.

  std::set<string> subgraph_node_names;
  for (const Node* node : subgraph_nodes) {
    subgraph_node_names.insert(node->name());
  }

  std::unordered_map<int, int> old_to_new_id_map;
  // Copy internal nodes to the new graphdef.
  string local_scope = subgraph_nodes.front()->name();
  for (const Node* node : subgraph_nodes) {
    local_scope = GetCommonNameScope(local_scope, node->name());
    old_to_new_id_map[node->id()] = segment_def->node_size();
    auto snode = segment_def->add_node();
    *snode = node->def();
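    // A Shape node's result lives on the host, while the rest of the engine
    // expects device tensors, so Shape nodes are rewritten below.  Roughly
    // (names illustrative):
    //   before:  foo = Shape(x);  consumer(foo:0)
    //   after:   foo_cpu_result = Shape(x);
    //            foo = _CopyFromHostToGpu(foo_cpu_result:0);
    //            consumer(foo:0)  // unchanged
    // Keeping the original name on the copy node means consumers need no
    // rewiring.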
    if (snode->op() == "Shape") {
      const std::string copy_op_name = snode->name();
      std::string shape_op_name = copy_op_name + "_cpu_result";

      // Add a node to copy the Shape OP output to GPU. Reuse the Shape OP
      // node name for this new node, so that consumers of the original
      // result pick up the copied value without having to change the name
      // they reference.
      NodeDef* copy_op = segment_def->add_node();
      copy_op->set_name(copy_op_name);
      copy_op->set_op("_CopyFromHostToGpu");
      *copy_op->add_input() = shape_op_name + ":0";
      tensorflow::DataType type = snode->attr().at("out_type").type();
      AddNodeAttr("T", type, copy_op);
      AddNodeAttr("out_type", type, copy_op);

      // Rename the Shape OP node and add the new name to the set of node
      // names for the engine.
      snode->set_name(shape_op_name);
      subgraph_node_names.insert(shape_op_name);
      VLOG(2) << "Add copy node " << copy_op->DebugString();
    }
    VLOG(2) << "Copying " << snode->name() << " to subgraph";
  }
  // Update the inputs of the new input nodes to point to placeholder nodes.
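  // For example (names illustrative): a node that read "outside_node:0"
  // through engine input port 0 will instead read the "_Arg" placeholder,
  // e.g. "TensorRTInputPH_0" with the default kInputPHName prefix.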
  for (size_t i = 0; i < connections->size(); ++i) {
    auto& connection = connections->at(i);
    if (connection.is_control_edge() || !connection.is_input_edge) continue;
    auto snode =
        segment_def->mutable_node(old_to_new_id_map[connection.inside_id]);
    const string arg_name =
        StrCat(IONamePrefixes::kInputPHName, connection.port_number);
    VLOG(1) << "Updating " << snode->name() << ":" << connection.inside_port
            << " from " << snode->input(connection.inside_port) << " to "
            << arg_name;
    snode->set_input(connection.inside_port, arg_name);
  }

  // Remove control inputs that are not inside the segment.
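  // The inner loop compacts snode->input() in place: surviving inputs are
  // shifted down to actual_input_idx and the leftover trailing slots are
  // trimmed afterwards.  A control input (slot Graph::kControlSlot) coming
  // from outside the segment is simply dropped; a data input from outside
  // that is not an engine placeholder means the segment is malformed, which
  // is reported as an error.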
  for (int i = 0; i < segment_def->node_size(); ++i) {
    auto snode = segment_def->mutable_node(i);
    const int input_size = snode->input_size();
    int input_idx = 0;
    int actual_input_idx = 0;
    while (input_idx < input_size) {
      TensorId input = ParseTensorName(snode->input(input_idx));
      if (!subgraph_node_names.count(
              string(input.first.data(), input.first.size())) &&
          !IsEngineInput(input.first)) {
        if (input.second == Graph::kControlSlot) {
          VLOG(1) << "... removing control input " << input.first
                  << " from subgraph.";
          ++input_idx;
          continue;
        } else {
          return errors::InvalidArgument(
              "Found non-control input outside the segment that is not an "
              "engine connection to ",
              snode->name(), ": ", input.first);
        }
      }
      if (actual_input_idx != input_idx) {
        snode->set_input(actual_input_idx, snode->input(input_idx));
      }
      ++input_idx;
      ++actual_input_idx;
    }
    for (int remove = input_size - actual_input_idx; remove > 0; --remove) {
      snode->mutable_input()->RemoveLast();
    }
  }
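  // local_scope is the longest common name-scope prefix of the segment's
  // nodes (accumulated via GetCommonNameScope above); it is prepended here,
  // presumably so the generated engine lands under the segment's own name
  // scope.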
  engine_info->engine_name = StrCat(local_scope, engine_info->engine_name);
  engine_info->has_int32_input = has_int32_input;
  return Status::OK();
}

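// Decides whether an output edge of the segment should be kept as an engine
// output.  Control edges always pass; edges whose source is a Const node are
// rejected, presumably since a constant value need not be computed by the
// engine.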
bool OutputEdgeValidator::operator()(const Edge* out_edge) const {
  if (out_edge->IsControlEdge()) return true;
  if (out_edge->src()->type_string() == "Const") {
    VLOG(1) << "--> Need to remove output node " << out_edge->src()->name()
            << " which is a Const.";
    return false;
  }
  return true;
}

}  // namespace convert
}  // namespace tensorrt
}  // namespace tensorflow

#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT