/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/extendrt/delegate/tensorrt/tensorrt_subgraph.h"
#include <cuda_runtime_api.h>
#include <string>
#include <vector>
#include <set>
#include <queue>
#include <algorithm>
#include <numeric>
#include <functional>
#include <fstream>
#include <limits>
#include <unordered_map>
#include <iomanip>
#include "src/extendrt/delegate/delegate_utils.h"
#include "src/extendrt/delegate/tensorrt/tensorrt_utils.h"
#include "src/common/utils.h"
#include "ops/auto_generate/gen_lite_ops.h"
#include "ops/fusion/topk_fusion.h"

namespace mindspore::lite {
TensorRTSubGraph::TensorRTSubGraph(std::vector<TensorRTOp *> ops, const std::vector<TensorInfo> &inputs,
                                   const std::vector<TensorInfo> &outputs, const mindspore::Context *ctx,
                                   std::shared_ptr<GPUDeviceInfo> device_info, TensorRTRuntime *runtime,
                                   bool support_resize, bool support_hw_resize,
                                   const ProfileConfigs &trt_profile_config)
    : inputs_(inputs),
      outputs_(outputs),
      all_ops_(std::move(ops)),
      device_info_(device_info),
      runtime_(runtime),
      trt_profile_config_(trt_profile_config) {
  trt_specific_weight_handled_inner_ = {
    ops::kNameTranspose, ops::kNameReshape, ops::kNameExpandDims, ops::kNameTopKFusion, ops::kNameBroadcastTo,
  };
  if (!support_resize) {
    input_batchsize_index_ = -1;
    input_hw_index_ = -1;
  }
  if (!support_hw_resize) {
    input_hw_index_ = -1;
  }
}

TensorRTSubGraph::~TensorRTSubGraph() {
  if (ctx_ != nullptr) {
    delete ctx_;
  }
  if (config_ != nullptr) {
    config_->destroy();
    config_ = nullptr;
  }
#ifdef PROFILER_
  auto profile = dynamic_cast<SimpleProfiler *>(trt_context_->getProfiler());
  if (profile != nullptr) std::cout << *profile << std::endl;
  delete profile;
#endif
  if (trt_context_ != nullptr) {
    trt_context_->destroy();
    trt_context_ = nullptr;
  }
  if (engine_ != nullptr) {
    engine_->destroy();
    engine_ = nullptr;
  }
  if (tensor_bindings_ != nullptr) {
    delete[] tensor_bindings_;
    tensor_bindings_ = nullptr;
  }
  for (auto op : all_ops_) {
    delete op;
  }
}

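// Checks that the user-supplied dynamic-shape profiles are consistent with the graph inputs:
// every profile must cover every input, and the min/opt/max dims of each profile entry must
// have the same rank as the corresponding input shape.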
bool TensorRTSubGraph::IsValidProfileDims() const {
  if (trt_profile_config_.profiles.empty()) {
    MS_LOG(INFO) << "Number of profiles is 0.";
    return false;
  }
  for (auto &profile : trt_profile_config_.profiles) {
    if (profile.inputs.size() != trt_profile_config_.input_infos.size()) {
      MS_LOG(WARNING) << "Profile input size " << profile.inputs.size() << " != input shape size "
                      << trt_profile_config_.input_infos.size();
      return false;
    }
    for (size_t i = 0; i < profile.inputs.size(); i++) {
      const auto &profile_input = profile.inputs[i];
      const auto &input_info = trt_profile_config_.input_infos[i];
      if (profile_input.min_dims.size() != input_info.input_shape.size()) {
        MS_LOG(WARNING) << "Profile input " << input_info.name << " min dims number " << profile_input.min_dims.size()
                        << " != input shape dim number " << input_info.input_shape.size();
        return false;
      }
      if (profile_input.max_dims.size() != input_info.input_shape.size()) {
        MS_LOG(WARNING) << "Profile input " << input_info.name << " max dims number " << profile_input.max_dims.size()
                        << " != input shape dim number " << input_info.input_shape.size();
        return false;
      }
      if (profile_input.opt_dims.size() != input_info.input_shape.size()) {
        MS_LOG(WARNING) << "Profile input " << input_info.name << " opt dims number " << profile_input.opt_dims.size()
                        << " != input shape dim number " << input_info.input_shape.size();
        return false;
      }
    }
  }
  return true;
}

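// One-time initialization: resolves the subgraph's boundary ops, creates the TensorRTContext,
// applies the device/builder configuration, creates one optimization profile per configured
// profile (or a single default one), and tries to load a previously serialized engine so the
// build step can be skipped entirely.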
int TensorRTSubGraph::Init(cudaStream_t stream, cublasHandle_t cublas_handle, cublasLtHandle_t cublaslt_handle) {
  auto ret = GetGraphInOutOps(inputs_, outputs_, &in_ops_, &out_ops_, all_ops_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Get TensorRT subgraph input and output ops failed.";
    return RET_ERROR;
  }
  ctx_ = new (std::nothrow) TensorRTContext();
  if (ctx_ == nullptr) {
    MS_LOG(ERROR) << "New TensorRTContext failed.";
    return RET_ERROR;
  }
  ctx_->SetRuntime(runtime_);
  if (!ctx_->Init()) {
    MS_LOG(ERROR) << "Init TensorRTContext failed.";
    return RET_ERROR;
  }
  if (SetDeviceConfig(stream, cublas_handle, cublaslt_handle) != RET_OK) {
    MS_LOG(WARNING) << "set tensorrt config failed.";
  }
  serializer_ = std::make_shared<TensorRTSerializer>(serialize_file_path_);
  if (serializer_ == nullptr) {
    MS_LOG(ERROR) << "create Serializer failed.";
    return RET_ERROR;
  }
  using_input_ranges_ = IsValidProfileDims();
  if (using_input_ranges_) {
    for (size_t i = 0; i != trt_profile_config_.profiles.size(); ++i) {
      profiles_.push_back(runtime_->GetBuilder()->createOptimizationProfile());
    }
  } else {
    profiles_.push_back(runtime_->GetBuilder()->createOptimizationProfile());
  }
  for (size_t i = 0; i != profiles_.size(); ++i) {
    if (profiles_[i] == nullptr) {
      MS_LOG(ERROR) << "create optimization profile failed.";
      return RET_ERROR;
    }
  }
  engine_ = serializer_->GetSerializedEngine();
  if (engine_ != nullptr) {
    MS_LOG(INFO) << "using serialized engine " << serialize_file_path_;
    return RET_OK;
  }
  for (size_t i = 0; i < inputs_.size(); i++) {
    if (inputs_[i].Shape().size() != DIMENSION_4D) {
      input_hw_index_ = -1;
    }
  }
  return RET_OK;
}

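// Builds the serialized CUDA engine from the populated network definition: registers every
// optimization profile with the builder config, logs the network layers for debugging, builds
// the engine, and caches it on disk when a serialization path was configured.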
int TensorRTSubGraph::BuildEngine() {
  for (auto &profile : profiles_) {
    if (this->config_->addOptimizationProfile(profile) == -1) {
      MS_LOG(ERROR) << "addOptimizationProfile failed.";
      return RET_ERROR;
    }
  }
  MS_LOG(INFO) << "build engine for tensorrt network: " << ctx_->network()->getName();
  // print all network ops
  for (int i = 0; i < ctx_->network()->getNbLayers(); i++) {
    MS_LOG(DEBUG) << "tensorrt op: " << ctx_->network()->getLayer(i)->getName();
  }
  MS_LOG(DEBUG) << "end of tensorrt network: " << ctx_->network()->getName();

  this->engine_ = runtime_->GetBuilder()->buildEngineWithConfig(*ctx_->network(), *this->config_);
  if (this->engine_ == nullptr) {
    MS_LOG(ERROR) << "Create engine failed in TensorRT network";
    return RET_ERROR;
  }
  if (serialize_file_path_.size() > 0) {
    serializer_->SaveSerializedEngine(engine_);
  }
  return RET_OK;
}

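// Creates the builder config and applies precision and resource settings: enables FP16/INT8
// kernels when both the model and the GPU support them, binds the CUDA stream and cuBLAS
// handles to the runtime, and caps the builder workspace size.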
int TensorRTSubGraph::SetDeviceConfig(cudaStream_t stream, cublasHandle_t cublas_handle,
                                      cublasLtHandle_t cublaslt_handle) {
  if (config_ == nullptr) {
    this->config_ = runtime_->GetBuilder()->createBuilderConfig();
    if (this->config_ == nullptr) {
      MS_LOG(ERROR) << "create builder config failed.";
      return RET_ERROR;
    }
  }
  // set fp16
  if (device_info_->GetEnableFP16() && runtime_->GetBuilder()->platformHasFastFp16()) {
    MS_LOG(INFO) << "set fp16 flag successfully for tensorrt.";
    config_->setFlag(nvinfer1::BuilderFlag::kFP16);
    runtime_->SetRuntimePrecisionMode(RuntimePrecisionMode_FP16);
  }

  // set int8
  if (IsInt8Mode() && runtime_->GetBuilder()->platformHasFastInt8()) {
    MS_LOG(INFO) << "set int8 flag successfully for tensorrt.";
    config_->setFlag(nvinfer1::BuilderFlag::kINT8);
    // Mark calibrator as null
    config_->setInt8Calibrator(nullptr);
    input_hw_index_ = -1;
  } else {
    MS_LOG(INFO) << "inputs no quant params or platform not support int8.";
  }
  runtime_->SetCudaStream(stream, cublas_handle, cublaslt_handle);
  config_->setProfileStream(stream);
  stream_ = stream;

  MS_LOG(INFO) << GetRankID() << " tensorrt subgraph stream: " << stream_;

  // cap the builder workspace (setMaxWorkspaceSize) at 2100 MB
  constexpr size_t kWorkspaceSize = static_cast<size_t>(2100) * (1 << 20);
  config_->setMaxWorkspaceSize(kWorkspaceSize);
  return RET_OK;
}

bool TensorRTSubGraph::IsInt8Mode() {
  for (auto cur_op : all_ops_) {
    if (cur_op->GetQuantType() == schema::QuantType_QUANT_ALL) {
      return true;
    }
  }
  return false;
}

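// Adds a graph input tensor to the TensorRT network definition (or returns the existing
// ITensor if it was already added), converting the MindSpore data type and filling the
// optimization-profile dimensions either from the configured ranges or from the static shape.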
nvinfer1::ITensor *TensorRTSubGraph::SetTensorRTNetworkInput(const TensorInfo &in_tensor, int index) {
  if (index < 0) {
    return nullptr;
  }
  for (int i = 0; i < ctx_->network()->getNbInputs(); i++) {
    if (in_tensor.Name().compare(ctx_->network()->getInput(i)->getName()) == 0) {
      MS_LOG(INFO) << "input tensor is already added in network: " << in_tensor.Name();
      return ctx_->network()->getInput(i);
    }
  }

  auto cuda_dtype = ConvertDataType(in_tensor.DataType());
  if (static_cast<int>(cuda_dtype) == -1) {
    MS_LOG(ERROR) << "Unsupported input data type " << static_cast<int>(in_tensor.DataType());
    return nullptr;
  }
  nvinfer1::Dims input_dims;
  if (using_input_ranges_) {
    input_dims = SetInputDimsProfile(in_tensor, index);
  } else {
    input_dims = ParseInputDimsProfile(in_tensor, index);
  }
  MS_LOG(INFO) << "add network input: " << in_tensor.Name();
  return ctx_->network()->addInput(in_tensor.Name().c_str(), cuda_dtype, input_dims);
}

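// Writes the user-configured min/opt/max dims of this input into every optimization profile
// and returns the dims that the network input itself will be declared with.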
nvinfer1::Dims TensorRTSubGraph::SetInputDimsProfile(const TensorInfo &in_tensor, int index) {
  auto input_info = trt_profile_config_.input_infos[index];
  auto input_dims = ConvertCudaDims(input_info.input_shape);
  DebugDims("input dims", input_dims);
  for (size_t i = 0; i < trt_profile_config_.profiles.size(); i++) {
    auto &profile = trt_profile_config_.profiles[i];
    auto min_dims = ConvertCudaDims(profile.inputs[index].min_dims);
    if (!profiles_[i]->setDimensions(input_info.name.c_str(), nvinfer1::OptProfileSelector::kMIN, min_dims)) {
      MS_LOG(ERROR) << "setDimensions of kMIN failed for " << input_info.name;
      return input_dims;
    }
    auto opt_dims = ConvertCudaDims(profile.inputs[index].opt_dims);
    if (!profiles_[i]->setDimensions(input_info.name.c_str(), nvinfer1::OptProfileSelector::kOPT, opt_dims)) {
      MS_LOG(ERROR) << "setDimensions of kOPT failed for " << input_info.name;
      return input_dims;
    }

    auto max_dims = ConvertCudaDims(profile.inputs[index].max_dims);
    if (!profiles_[i]->setDimensions(input_info.name.c_str(), nvinfer1::OptProfileSelector::kMAX, max_dims)) {
      MS_LOG(ERROR) << "setDimensions of kMAX failed for " << input_info.name;
      return input_dims;
    }
    DebugDims("min dims", min_dims);
    DebugDims("opt dims", opt_dims);
    DebugDims("max dims", max_dims);
  }
  return input_dims;
}

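// Fallback used when no explicit profiles were configured: registers the tensor's current
// static shape as min, opt and max in the single default profile and records the resulting
// ranges back into trt_profile_config_ so later profile-selection logic has ranges to check.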
nvinfer1::Dims TensorRTSubGraph::ParseInputDimsProfile(const TensorInfo &in_tensor, int index) {
  nvinfer1::Dims input_dims = ConvertCudaDims(in_tensor.Shape());
  nvinfer1::Dims input_dims_min = ConvertCudaDims(in_tensor.Shape());
  if (!profiles_.front()->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kMIN, input_dims_min)) {
    MS_LOG(ERROR) << "setDimensions of kMIN failed for " << in_tensor.Name();
    return input_dims;
  }
  nvinfer1::Dims input_dims_opt = ConvertCudaDims(in_tensor.Shape());
  if (!profiles_.front()->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kOPT, input_dims_opt)) {
    MS_LOG(ERROR) << "setDimensions of kOPT failed for " << in_tensor.Name();
    return input_dims;
  }
  nvinfer1::Dims input_dims_max = ConvertCudaDims(in_tensor.Shape());
  // input_dims_max should be the same with input network dims
  if (!profiles_.front()->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kMAX, input_dims_max)) {
    MS_LOG(ERROR) << "setDimensions of kMAX failed for " << in_tensor.Name();
    return input_dims;
  }
  if (trt_profile_config_.profiles.empty()) {
    ProfileItem profile_item;
    profile_item.inputs.resize(inputs_.size());
    trt_profile_config_.profiles.push_back(profile_item);
  }
  auto &profile_item = trt_profile_config_.profiles.back();
  profile_item.inputs[index].min_dims = ConvertMSShape(input_dims_min);
  profile_item.inputs[index].opt_dims = ConvertMSShape(input_dims_opt);
  profile_item.inputs[index].max_dims = ConvertMSShape(input_dims_max);

  DebugDims("input min dims", input_dims_min);
  DebugDims("input opt dims", input_dims_opt);
  DebugDims("input max dims", input_dims_max);
  return input_dims;
}

int TensorRTSubGraph::ParseInputsProfile() {
  MS_LOG(INFO) << "using serialized engine.";
  for (size_t i = 0; i < inputs_.size(); i++) {
    auto dim = ParseInputDimsProfile(inputs_[i], i);
    if (dim.nbDims <= 0) {
      MS_LOG(ERROR) << "input dims is invalid.";
      return RET_ERROR;
    }
  }
  return RET_OK;
}

int TensorRTSubGraph::GetInputIndexByName(const std::string &name) {
  for (size_t i = 0; i != inputs().size(); ++i) {
    if (inputs()[i].Name() == name) {
      return i;
    }
  }
  return -1;
}

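// Translates the MindSpore subgraph into a TensorRT network: registers every subgraph input,
// wires op inputs either to upstream op outputs or to constant (weight) tensors, asks each op
// to emit its TensorRT layers and INT8 dynamic ranges, marks the network outputs and finally
// builds the engine. When a serialized engine was loaded, only the input profiles are parsed.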
int TensorRTSubGraph::BuildTensorRTGraph() {
  MS_ASSERT(!all_ops_.empty());
  int ret;
  if (engine_ != nullptr) {
    return ParseInputsProfile();
  }
  // build engine online
  for (auto cur_op : all_ops_) {
    cur_op->SetRuntime(runtime_);
    for (size_t i = 0; i != cur_op->inputs().size(); ++i) {
      // Data From CPU
      auto in_tensor = cur_op->inputs()[i];
      if (IsSubGraphInputTensor(this->inputs(), in_tensor)) {
        nvinfer1::ITensor *trt_tensor = SetTensorRTNetworkInput(in_tensor, GetInputIndexByName(in_tensor.Name()));
        if (trt_tensor == nullptr) {
          MS_LOG(ERROR) << "SetTensorRTNetworkInput failed for " << in_tensor.Name();
          return RET_ERROR;
        }
        // avoid bool input tensor
        cur_op->SetSupportInputBool(false);

        ctx_->RegisterTensorWithSameName(ITensorHelper{trt_tensor, in_tensor.format(), true}, in_tensor.Name());
        continue;
      }

      ITensorHelper trt_tensor = FindTensorRTInputs(cur_op, in_tensor);
      if (trt_tensor.trt_tensor_ == nullptr) {
        // weight tensor
        auto weight_handled_inner =
          cur_op->IsWeightInputHanledInner() ||
          trt_specific_weight_handled_inner_.find(cur_op->type()) != trt_specific_weight_handled_inner_.end();
        if (!weight_handled_inner) {
          if (!in_tensor.IsConst()) {
            MS_LOG(ERROR) << "Weight Tensor data is not const.";
            return RET_ERROR;
          }
          trt_tensor.trt_tensor_ = lite::ConvertConstantTensor(ctx_, in_tensor, cur_op->GetOpName());
          trt_tensor.format_ = Format::NCHW;
          MS_LOG(INFO) << "auto convert constant tensor for: " << in_tensor.Name();
          ctx_->RegisterTensor(trt_tensor, in_tensor.Name());
        }
      } else {
        ctx_->RegisterTensor(trt_tensor, in_tensor.Name());
      }
    }
    MS_LOG(DEBUG) << "Parsing TensorRT op for " << cur_op->GetOpName();

    ret = cur_op->AddInnerOp(ctx_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Add op failed in TensorRT network: " << cur_op->GetOpName();
      return RET_ERROR;
    }
    ret = cur_op->SetInt8DynamicRange(ctx_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Set Int8 dynamic range failed in TensorRT network: " << cur_op->GetOpName();
      return RET_ERROR;
    }
  }
  ret = MarkOutputs();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "MarkOutputs failed in TensorRT network";
    return ret;
  }

  std::string network_name = "network_" + std::string(ctx_->network()->getInput(0)->getName()) + "_" +
                             std::string(ctx_->network()->getOutput(0)->getName());
  ctx_->network()->setName(network_name.c_str());
  this->name_ = network_name;
  ret = BuildEngine();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Create engine failed in TensorRT network";
    return ret;
  }
  return RET_OK;
}

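// Marks the subgraph outputs on the TensorRT network. Outputs that are also graph inputs or
// constants get an identity layer so there is a producer tensor to mark; regular outputs are
// located on the producing ops, renamed back to the original tensor name, and optionally cast
// to FP16. The index of the first dynamic (-1) dimension is remembered as the output
// batch-size index.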
int TensorRTSubGraph::MarkOutputs() {
  // Mark NetWork Output Tensor.
  for (const auto &out_tensor : outputs_) {
    std::string output_name = out_tensor.Name();
    auto input_it = std::find_if(inputs_.begin(), inputs_.end(),
                                 [=](const TensorInfo &input) { return input.Name() == output_name; });
    if (input_it != inputs_.end()) {
      nvinfer1::ITensor *trt_tensor = SetTensorRTNetworkInput(*input_it, GetInputIndexByName(input_it->Name()));
      ctx_->network()->markOutput(*trt_tensor);
      continue;
    }
    if (out_tensor.IsConst()) {
      MS_LOG(INFO) << "markOutput for: " << out_tensor.Name();
      auto output_helper = ctx_->MsName2Tensor(out_tensor.Name());
      if (output_helper.trt_tensor_ == nullptr) {
        output_helper.trt_tensor_ = lite::ConvertConstantTensor(ctx_, out_tensor, out_tensor.Name());
        output_helper.format_ = Format::NCHW;
        MS_LOG(INFO) << "auto convert constant tensor for: " << out_tensor.Name();
        ctx_->RegisterTensor(output_helper, out_tensor.Name());
      }
      nvinfer1::ITensor *out_trt_tensor = output_helper.trt_tensor_;
      out_trt_tensor->setName(("__" + out_tensor.Name()).c_str());
      out_trt_tensor = ctx_->network()->addIdentity(*out_trt_tensor)->getOutput(0);
      out_trt_tensor->setName(out_tensor.Name().c_str());
      ctx_->network()->markOutput(*out_trt_tensor);
      for (int n = 0; n < out_trt_tensor->getDimensions().nbDims; n++) {
        if (out_trt_tensor->getDimensions().d[n] == -1) {
          output_batchsize_index_ = n;
          break;
        }
      }
    }
    for (auto out_op : this->out_ops_) {
      for (size_t index = 0; index < out_op->outputs().size(); index++) {
        if (out_op->outputs()[index] == out_tensor) {
          MS_LOG(INFO) << "markOutput for: " << out_tensor.Name();
          auto output_helper = out_op->output(ctx_, index);
          nvinfer1::ITensor *out_trt_tensor = output_helper.trt_tensor_;
          out_trt_tensor->setName(("__" + out_tensor.Name()).c_str());
          auto out_layer = ctx_->network()->addIdentity(*out_trt_tensor);
          if (out_tensor.DataType() == DataType::kNumberTypeFloat16) {
            MS_LOG(WARNING) << "Cast output tensor " << out_tensor.Name() << " to fp16";
            out_layer->setOutputType(0, nvinfer1::DataType::kHALF);
          }
          out_trt_tensor = out_layer->getOutput(0);
          out_trt_tensor->setName(out_tensor.Name().c_str());
          ctx_->network()->markOutput(*out_trt_tensor);
          for (int n = 0; n < out_trt_tensor->getDimensions().nbDims; n++) {
            if (out_trt_tensor->getDimensions().d[n] == -1) {
              output_batchsize_index_ = n;
              break;
            }
          }
        }
      }
    }
  }
  return RET_OK;
}

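// Runtime preparation after the engine is built or deserialized: creates the execution
// context, allocates the binding table, selects the profile with the largest input volume,
// allocates device memory for every input sized to that profile's max dims, sets the binding
// dimensions, prepares each op, and allocates device memory for every output binding of every
// profile.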
int TensorRTSubGraph::Prepare() {
  int ret = lite::SetCudaDevice(device_info_);
  if (ret != RET_OK) {
    return ret;
  }
  if (this->engine_ == nullptr) {
    MS_LOG(ERROR) << "engine_ is null in this builder_";
    return RET_ERROR;
  }
  this->trt_context_ = this->engine_->createExecutionContext();
  if (this->trt_context_ == nullptr) {
    MS_LOG(ERROR) << "TensorRTSubGraph create context failed.";
    return RET_ERROR;
  }

#ifdef PROFILER_
  auto profiler = new SimpleProfiler("myprofiler");
  if (profiler == nullptr) {
    MS_LOG(WARNING) << "Cannot create profiler";
  }
  this->trt_context_->setProfiler(profiler);
#endif

  int binding_num = this->engine_->getNbBindings();
  if (binding_num <= 0) {
    MS_LOG(ERROR) << "TensorRTSubGraph binding num <= 0.";
    return RET_ERROR;
  }
  tensor_bindings_ = new (std::nothrow) void *[binding_num];
  if (tensor_bindings_ == nullptr) {
    MS_LOG(ERROR) << "malloc tensor binding array failed.";
    return RET_ERROR;
  }
  profile_index_ = MaxVolumnProfileIndex();
  if (this->trt_context_->setOptimizationProfile(profile_index_)) {
    MS_LOG(INFO) << "setOptimizationProfile: " << profile_index_;
  }
  const auto &profile = trt_profile_config_.profiles[profile_index_];
  for (size_t i = 0; i != inputs_.size(); ++i) {
    auto &tensor = inputs_[i];
    auto max_profile_dims = profile.inputs[i].max_dims;
    tensor.SetShape(max_profile_dims);
    int volumn = std::accumulate(max_profile_dims.begin(), max_profile_dims.end(), 1, std::multiplies<int>());
    auto type_size = lite::DataTypeSize(static_cast<enum TypeId>(tensor.DataType()));
    auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(tensor, volumn * type_size);
    if (device_ptr == nullptr) {
      MS_LOG(ERROR) << "malloc for inputs tensor device memory failed.";
      return RET_ERROR;
    }
    auto tensor_name = tensor.Name();
    trt_in_tensor_name_.push_back(tensor_name);
    int index = GetProfileBindingIndex(tensor_name, profile_index_);
    MS_LOG(INFO) << "device index " << index << " for tensor : " << tensor_name << " attr: " << device_ptr;
    tensor_bindings_[index] = device_ptr;
    nvinfer1::Dims input_dims = ConvertCudaDims(profile.inputs[i].max_dims);
    if (!this->trt_context_->setBindingDimensions(index, input_dims)) {
      MS_LOG(ERROR) << "invalid input dims of " << tensor.Name();
      return RET_ERROR;
    }
  }
  if (!this->trt_context_->allInputDimensionsSpecified()) {
    MS_LOG(ERROR) << "input dims need to be specified.";
    return RET_ERROR;
  }
  for (auto op : all_ops_) {
    ret = op->Prepare(tensor_bindings_, engine_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "prepare op failed of " << op->GetOpName();
      return RET_ERROR;
    }
  }
  for (auto &tensor : outputs_) {
    int max_index = GetProfileBindingIndex(tensor.Name(), profile_index_);
    auto out_dims = trt_context_->getBindingDimensions(max_index);
    int elem_num = std::accumulate(out_dims.d, out_dims.d + out_dims.nbDims, 1, std::multiplies<int>());
    DebugDims("out dims", out_dims);
    MS_LOG(INFO) << "Set output shape by tensorrt binding output";
    tensor.SetShape(lite::ConvertMSShape(out_dims));
    auto type_size = lite::DataTypeSize(static_cast<enum TypeId>(tensor.DataType()));
    if (tensor.DataType() == DataType::kNumberTypeBool) {
      type_size = lite::DataTypeSize(static_cast<enum TypeId>(DataType::kNumberTypeInt32));
    }
    auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(tensor, elem_num * type_size);
    if (device_ptr == nullptr) {
      MS_LOG(ERROR) << "malloc for outputs tensor device memory failed.";
      return RET_ERROR;
    }
    for (size_t j = 0; j != profiles_.size(); ++j) {
      int index = GetProfileBindingIndex(tensor.Name(), j);
      tensor_bindings_[index] = device_ptr;
    }
    trt_out_tensor_name_.push_back(tensor.Name());
  }
  return RET_OK;
}

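// Returns the index of the first optimization profile whose min/max ranges contain every new
// input shape, or -1 when none of the configured profiles can accommodate the requested resize.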
int TensorRTSubGraph::SelectProfile(const std::vector<ShapeVector> &new_shapes) const {
  std::vector<int> profile_index;
  for (size_t i = 0; i < profiles_.size(); ++i) {
    const auto &profile = trt_profile_config_.profiles[i];
    bool condition = true;
    for (size_t j = 0; j < trt_in_tensor_name_.size(); ++j) {
      auto new_shape = new_shapes[j];
      auto profile_input = profile.inputs[j];
      if (new_shape.size() != profile_input.max_dims.size()) {
        condition = false;
      } else {
        for (size_t od = 0; od < new_shape.size(); od++) {
          if (new_shape[od] < profile_input.min_dims[od] || new_shape[od] > profile_input.max_dims[od]) {
            condition = false;
            break;
          }
        }
      }
    }
    if (condition) {
      profile_index.push_back(i);
    }
  }
  return profile_index.empty() ? -1 : profile_index.front();
}

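// Picks the profile whose first input has the largest max-dims element count, so Prepare()
// can size device buffers for the worst case.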
size_t TensorRTSubGraph::MaxVolumnProfileIndex() const {
  int max_volumn = std::numeric_limits<int>::min();
  size_t max_volumn_index = 0;
  for (size_t i = 0; i < trt_profile_config_.profiles.size(); ++i) {
    const auto &profile = trt_profile_config_.profiles[i];
    // depend on the first input tensor
    int64_t volumn = std::accumulate(profile.inputs[0].max_dims.begin(), profile.inputs[0].max_dims.end(), 1,
                                     std::multiplies<int64_t>());
    if (volumn > max_volumn) {
      max_volumn_index = i;
      max_volumn = volumn;
    }
  }
  return max_volumn_index;
}

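// Resolves the engine binding index of a tensor under a given profile. With multiple
// optimization profiles TensorRT duplicates the bindings and suffixes their names with
// " [profile N]", so the lookup name has to be adjusted for every profile except the first.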
int TensorRTSubGraph::GetProfileBindingIndex(const std::string &name, size_t profile_index) {
  std::string binding_name = name;
  if (profile_index != 0) {
    binding_name += " [profile " + std::to_string(profile_index) + "]";
  }
  return this->engine_->getBindingIndex(binding_name.c_str());
}

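// Handles an input resize: selects a compatible optimization profile, validates the new shapes
// against the network definition, checks that all resized inputs agree on the new batch size,
// updates the binding dimensions, and refreshes the recorded output shapes from the execution
// context.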
int TensorRTSubGraph::OnNewInputShapes(const std::vector<ShapeVector> &new_shapes) {
  if (inputs_.size() != new_shapes.size()) {
    MS_LOG(ERROR) << "Graph inputs size " << inputs_.size() << " != resize input size " << new_shapes.size();
    return RET_ERROR;
  }
  auto select_profile_index = SelectProfile(new_shapes);
  if (select_profile_index < 0) {
    MS_LOG(ERROR) << "Not support input shape " << new_shapes;
    return RET_ERROR;
  }
  profile_index_ = static_cast<size_t>(select_profile_index);
  if (this->trt_context_->setOptimizationProfile(profile_index_)) {
    MS_LOG(INFO) << "setOptimizationProfile: " << profile_index_;
  }
  int batch_size = -1;
  for (size_t i = 0; i < trt_in_tensor_name_.size(); i++) {
    if (inputs_[i].Shape() == new_shapes[i]) {
      continue;
    }
    if (input_batchsize_index_ == -1) {
      MS_LOG(ERROR) << "current network doesn't support resize.";
      return RET_ERROR;
    }
    inputs_[i].SetShape(new_shapes[i]);
    if (ctx_->network() != nullptr) {
      for (int j = 0; j < ctx_->network()->getNbInputs(); j++) {
        if (trt_in_tensor_name_[i].compare(ctx_->network()->getInput(j)->getName()) != 0) {
          continue;
        }
        nvinfer1::Dims construct_dims = ctx_->network()->getInput(j)->getDimensions();
        bool ret = ValidInputResizeDims(construct_dims, inputs_[i].Shape());
        if (!ret) {
          MS_LOG(ERROR) << "input resize shape is invalid.";
          return RET_ERROR;
        }
      }
    }

    MS_LOG(INFO) << "resize at input_batch_index " << input_batchsize_index_ << ", update batch size to "
                 << inputs_[i].Shape()[input_batchsize_index_];
    int new_batch_size = inputs_[i].Shape()[input_batchsize_index_];
    if (batch_size != -1 && batch_size != new_batch_size) {
      MS_LOG(ERROR) << "Batch size " << batch_size << " of input 0 != batch size " << new_batch_size << " of input "
                    << i;
      return RET_ERROR;
    }
    batch_size = new_batch_size;

    int index = GetProfileBindingIndex(trt_in_tensor_name_[i], profile_index_);
    // Set actual input size
    nvinfer1::Dims input_dims = ConvertCudaDims(inputs_[i].Shape());
    for (int od = 0; od < input_dims.nbDims; od++) {
      MS_LOG(DEBUG) << "in tensor " << trt_in_tensor_name_[i] << " dims at " << od << " is " << input_dims.d[od];
    }

    if (!this->trt_context_->setBindingDimensions(index, input_dims)) {
      MS_LOG(ERROR) << "invalid input dims of " << inputs_[i].Name() << ", profile index: " << profile_index_
                    << ", dst dims: " << CudaDimsAsString(input_dims);
      return RET_ERROR;
    }
  }
  if (!this->trt_context_->allInputDimensionsSpecified()) {
    MS_LOG(ERROR) << "input dims need to be specified.";
    return RET_ERROR;
  }
  if (batch_size != -1) {
    for (size_t i = 0; i < trt_out_tensor_name_.size(); i++) {
      auto index = GetProfileBindingIndex(trt_out_tensor_name_[i], profile_index_);
      auto out_dims = trt_context_->getBindingDimensions(index);
      DebugDims("out dims", out_dims);
      auto new_shape = lite::ConvertMSShape(out_dims);
      MS_LOG(INFO) << "Set output shape of " << trt_out_tensor_name_[i] << " to " << new_shape
                   << "  by tensorrt binding output";
      outputs_[i].SetShape(new_shape);
    }
  }
  return RET_OK;
}

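// Variable-sequence-length (VSL) preprocessing used by the transformer optimization path:
// for the input_ids / expert_ids / position_ids tensors it packs only the valid tokens of each
// batch (the per-batch valid lengths appear to come from the last input tensor) into a
// contiguous host buffer and copies just the compacted region to the device; the attention
// mask and current-index inputs are skipped, and all remaining inputs are copied unchanged.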
int TensorRTSubGraph::VSLPreExectute(const std::vector<tensor::Tensor> &inputs, int i, bool sync,
                                     const std::string &tensor_name) {
  const bool is_expert_ids = (inputs.size() == Num6) ? Num1 : 0;
  const int input_ids_idx = 0;
  const int expert_ids_idx = (is_expert_ids) ? Num1 : -1;
  const int attn_mask_idx = Num1 + is_expert_ids;
  const int pos_ids_idx = Num2 + is_expert_ids;
  const int current_idx_idx = Num3 + is_expert_ids;
  if (i == input_ids_idx || i == expert_ids_idx || i == pos_ids_idx) {
    int *in_ptr = static_cast<int *>(inputs[i].data_ptr()->data());
    int batch = inputs[trt_in_tensor_name_.size() - Num1].ElementsNum();
    int seq = inputs[0].ElementsNum() / batch;
    int export_num = (i != expert_ids_idx) ? Num1 : inputs[i].ElementsNum() / (batch * seq);
    bool incremental_mode =
      (static_cast<const int32_t *>(inputs[pos_ids_idx].data().const_data())[0] != 0) ? true : false;
    size_t h_token = 0;
    for (int k = 0; k < batch; k++) {
      int actual_seq_len =
        (incremental_mode)
          ? Num1
          : (static_cast<const int32_t *>(inputs[trt_in_tensor_name_.size() - Num1].data().const_data())[k] + Num1);
      int batch_valid = static_cast<const int32_t *>(inputs[trt_in_tensor_name_.size() - Num1].data().const_data())[k];
      h_token += (batch_valid == -1) ? 0 : actual_seq_len;
    }
    for (int j = 0; j < export_num; j++) {
      size_t h_token_idx = 0;
      for (int k = 0; k < batch; k++) {
        int actual_seq_len =
          (incremental_mode)
            ? Num1
            : (static_cast<const int32_t *>(inputs[trt_in_tensor_name_.size() - Num1].data().const_data())[k] + Num1);
        for (int l = 0; l < actual_seq_len; l++) {
          in_ptr[j * h_token + h_token_idx + l] =
            static_cast<int *>(inputs[i].data_ptr()->data())[j * batch * seq + k * seq + l];
        }
        h_token_idx += actual_seq_len;
      }
    }
    return runtime_->GetAllocator()->SyncMemHostToDevice(inputs[i], tensor_name, sync,
                                                         h_token * export_num * sizeof(int));
  } else if (i != attn_mask_idx && i != current_idx_idx) {
    return runtime_->GetAllocator()->SyncMemHostToDevice(inputs[i], tensor_name, sync);
  }
  return RET_OK;
}

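// Copies execute-call inputs to the device and fills the binding table: input shapes are first
// propagated via OnNewInputShapes, then each input either reuses the caller's device address
// or is copied host-to-device into allocator-owned memory; output bindings likewise reuse
// caller device buffers when provided (and large enough) or fall back to allocator memory.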
int TensorRTSubGraph::PreExecute(const std::vector<tensor::Tensor> &inputs, const std::vector<tensor::Tensor> &outputs,
                                 bool sync) {
  if (inputs_.size() != inputs.size()) {
    MS_LOG(ERROR) << "Graph inputs size " << inputs_.size() << " != execute inputs size " << inputs.size();
    return RET_ERROR;
  }
  if (!outputs.empty() && outputs.size() != outputs_.size()) {
    MS_LOG(ERROR) << "Graph outputs size " << outputs_.size() << " != execute outputs size " << outputs.size();
    return RET_ERROR;
  }
  std::vector<ShapeVector> new_shapes;
  std::transform(inputs.begin(), inputs.end(), std::back_inserter(new_shapes), [](auto &t) { return t.shape_c(); });
  auto ret = OnNewInputShapes(new_shapes);
  if (ret != RET_OK) {
    return ret;
  }
  for (size_t i = 0; i < trt_in_tensor_name_.size(); i++) {
    auto trt_tensor_name = trt_in_tensor_name_[i];
    void *device_ptr = nullptr;
    auto input_device_address = inputs[i].device_address();
    if (input_device_address != nullptr && input_device_address->GetMutablePtr() != nullptr) {
      device_ptr = input_device_address->GetMutablePtr();
    } else {
      device_ptr = runtime_->GetAllocator()->MallocDeviceMem(trt_tensor_name, inputs_[i].DataSize(),
                                                             ConvertDataType(inputs_[i].DataType()));
      if (device_ptr == nullptr) {
        MS_LOG(ERROR) << "realloc for input tensor device memory failed.";
        return RET_ERROR;
      }
      if (runtime_->IsTransformerOptimizeSigma()) {
        ret = VSLPreExectute(inputs, i, sync, trt_tensor_name);
      } else {
        ret = runtime_->GetAllocator()->SyncMemHostToDevice(inputs[i], trt_tensor_name, sync);
      }
      if (ret != RET_OK) {
        MS_LOG(ERROR) << "sync mem from host to device failed for " << trt_tensor_name;
        return RET_ERROR;
      }
      runtime_->GetAllocator()->MarkMemValid(trt_tensor_name, true);
    }
    int index = GetProfileBindingIndex(trt_tensor_name, profile_index_);
    MS_LOG(INFO) << "device index " << index << " for tensor : " << trt_tensor_name << " attr: " << device_ptr;
    tensor_bindings_[index] = device_ptr;
  }
  for (size_t i = 0; i < trt_out_tensor_name_.size(); i++) {
    const auto &trt_out_tensor_name = trt_out_tensor_name_[i];
    int index = GetProfileBindingIndex(trt_out_tensor_name, profile_index_);
    void *device_ptr = nullptr;
    if (outputs.size() > i) {
      auto &output = outputs[i];
      if (output.device_address() && output.device_address()->GetMutablePtr()) {
        if (output.Size() < outputs_[i].DataSize()) {
          MS_LOG(ERROR) << "Specified output device data size " << output.Size()
                        << " cannot be less than execute output data size " << outputs_[i].DataSize()
                        << ", output shape: " << outputs_[i].Shape();
          return RET_ERROR;
        }
        device_ptr = output.device_address()->GetMutablePtr();
      }
    }
    if (!device_ptr) {
      device_ptr = runtime_->GetAllocator()->MallocDeviceMem(trt_out_tensor_name, outputs_[i].DataSize(),
                                                             ConvertDataType(outputs_[i].DataType()));
      if (device_ptr == nullptr) {
        MS_LOG(ERROR) << "realloc for outputs tensor device memory failed.";
        return RET_ERROR;
      }
    }
    tensor_bindings_[index] = device_ptr;
  }
  return RET_OK;
}

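// Collects results after inference: reads the actual binding dimensions to get each output
// shape, then either copies device memory into the caller-provided host tensors (when no
// device address was supplied) or, if no output tensors were passed in, allocates new host
// tensors and appends them. Input and output device buffers are marked invalid afterwards so
// the next execution re-synchronizes them.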
int TensorRTSubGraph::PostExecute(std::vector<tensor::Tensor> *outputs, bool sync) {
  if (!outputs->empty() && outputs->size() != outputs_.size()) {
    MS_LOG(ERROR) << "Graph outputs size " << outputs_.size() << " != execute outputs size " << outputs->size();
    return RET_ERROR;
  }
  auto has_outputs = !outputs->empty();
  for (size_t i = 0; i < trt_out_tensor_name_.size(); i++) {
    const auto &trt_out_tensor_name = trt_out_tensor_name_[i];
    auto index = GetProfileBindingIndex(trt_out_tensor_name, profile_index_);
    // actual output tensor dims
    auto out_dims = this->trt_context_->getBindingDimensions(index);
    std::vector<int64_t> new_shape = lite::ConvertMSShape(out_dims);
    for (int od = 0; od < out_dims.nbDims; od++) {
      MS_LOG(DEBUG) << "out tensor " << trt_out_tensor_name << " dims at " << od << " is " << new_shape[od];
    }
    runtime_->GetAllocator()->MarkMemValid(trt_out_tensor_name, true);
    if (has_outputs) {
      auto &tensor = outputs->at(i);
      auto dst_device = tensor.device_address();
      if (dst_device == nullptr || dst_device->GetMutablePtr() == nullptr) {
        if (tensor.Size() < outputs_[i].DataSize()) {
          MS_LOG(ERROR) << "Specified output host data size " << tensor.Size()
                        << " cannot be less than execute output data size " << outputs_[i].DataSize()
                        << ", output shape: " << new_shape;
          return RET_ERROR;
        }
        auto host_address = tensor.data_c();
        if (host_address == nullptr) {
          MS_LOG(ERROR) << "Specified output device or host address cannot be nullptr";
          return RET_ERROR;
        }
        int sync_ret = runtime_->GetAllocator()->SyncMemDeviceToHost(host_address, outputs_[i].DataSize(),
                                                                     trt_out_tensor_name, sync);
        if (sync_ret != RET_OK) {
          MS_LOG(ERROR) << "sync mem from device to host failed for " << trt_out_tensor_name;
          return sync_ret;
        }
      }
    } else {
      tensor::Tensor output_tensor(static_cast<enum TypeId>(outputs_[i].DataType()), new_shape);
      int sync_ret = runtime_->GetAllocator()->SyncMemDeviceToHost(&output_tensor, trt_out_tensor_name, sync);
      if (sync_ret != RET_OK) {
        MS_LOG(ERROR) << "sync mem from device to host failed for " << trt_out_tensor_name;
        return sync_ret;
      }
      outputs->push_back(output_tensor);
    }
    runtime_->GetAllocator()->MarkMemValid(trt_out_tensor_name, false);
  }
  // make mem invalid, prepare for next execute
  for (size_t i = 0; i < inputs_.size(); i++) {
    runtime_->GetAllocator()->MarkMemValid(trt_in_tensor_name_[i], false);
  }
  return RET_OK;
}

bool TensorRTSubGraph::ValidInputResizeDims(const nvinfer1::Dims &construct_dims,
                                            const std::vector<int64_t> &resize_input_shape) {
  if (static_cast<size_t>(construct_dims.nbDims) != resize_input_shape.size()) {
    MS_LOG(ERROR) << "invalid resize input.";
    return false;
  }
  return true;
}

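// Runs one inference: selects the CUDA device, stages inputs/outputs, launches the engine
// either synchronously (executeV2) or asynchronously on the bound stream (enqueueV2, when
// built with ASYNC_INFER), gathers the outputs, and synchronizes the stream in the async case.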
int TensorRTSubGraph::Execute(const std::vector<tensor::Tensor> &inputs, std::vector<tensor::Tensor> *outputs) {
#ifdef ASYNC_INFER
  bool sync = false;
#else
  bool sync = true;
#endif
  int ret = lite::SetCudaDevice(device_info_);
  if (ret != RET_OK) {
    return ret;
  }
  ret = PreExecute(inputs, *outputs, sync);
  if (ret != RET_OK) {
    return ret;
  }
  if (sync) {
    if (!this->trt_context_->executeV2(tensor_bindings_)) {
      MS_LOG(ERROR) << "TensorRT execute failed.";
      return RET_ERROR;
    }
  } else {
    if (!this->trt_context_->enqueueV2(tensor_bindings_, stream_, nullptr)) {
      MS_LOG(ERROR) << "TensorRT execute failed.";
      return RET_ERROR;
    }
  }
  ret = PostExecute(outputs, sync);
  if (ret != RET_OK) {
    return ret;
  }
  if (!sync) {
    cudaStreamSynchronize(stream_);
  }
  return RET_OK;
}

int TensorRTSubGraph::Resize(const std::vector<tensor::Tensor> &, const std::vector<ShapeVector> &new_shapes) {
  return OnNewInputShapes(new_shapes);
}

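// Searches the upstream ops of cur_op for an output tensor with the same name as in_tensor and
// returns its ITensorHelper; an empty helper means the input is not produced inside the
// subgraph (typically a weight/constant that the caller converts separately).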
ITensorHelper TensorRTSubGraph::FindTensorRTInputs(TensorRTOp *cur_op, const TensorInfo &in_tensor) {
  for (auto input_op : cur_op->in_ops()) {
    for (size_t i = 0; i < input_op->outputs().size(); i++) {
      auto out_tensor = input_op->outputs().at(i);
      if (in_tensor.Name().compare(out_tensor.Name()) == 0) {
        return input_op->output(ctx_, i);
      }
    }
  }
  return ITensorHelper{};
}
}  // namespace mindspore::lite