/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/extendrt/delegate/tensorrt/tensorrt_subgraph.h"
#include <cuda_runtime_api.h>
#include <string>
#include <vector>
#include <set>
#include <queue>
#include <algorithm>
#include <numeric>
#include <functional>
#include <fstream>
#include <limits>
#include <unordered_map>
#include <iomanip>
#include "src/extendrt/delegate/delegate_utils.h"
#include "src/extendrt/delegate/tensorrt/tensorrt_utils.h"
#include "src/common/utils.h"
#include "ops/auto_generate/gen_lite_ops.h"
#include "ops/fusion/topk_fusion.h"

namespace mindspore::lite {
TensorRTSubGraph::TensorRTSubGraph(std::vector<TensorRTOp *> ops, const std::vector<TensorInfo> &inputs,
                                   const std::vector<TensorInfo> &outputs, const mindspore::Context *ctx,
                                   std::shared_ptr<GPUDeviceInfo> device_info, TensorRTRuntime *runtime,
                                   bool support_resize, bool support_hw_resize,
                                   const ProfileConfigs &trt_profile_config)
    : inputs_(inputs),
      outputs_(outputs),
      all_ops_(std::move(ops)),
      device_info_(device_info),
      runtime_(runtime),
      trt_profile_config_(trt_profile_config) {
  trt_specific_weight_handled_inner_ = {
    ops::kNameTranspose, ops::kNameReshape, ops::kNameExpandDims, ops::kNameTopKFusion, ops::kNameBroadcastTo,
  };
  if (!support_resize) {
    input_batchsize_index_ = -1;
    input_hw_index_ = -1;
  }
  if (!support_hw_resize) {
    input_hw_index_ = -1;
  }
}

TensorRTSubGraph::~TensorRTSubGraph() {
  if (ctx_ != nullptr) {
    delete ctx_;
  }
  if (config_ != nullptr) {
    config_->destroy();
    config_ = nullptr;
  }
#ifdef PROFILER_
  if (trt_context_ != nullptr) {
    auto profile = dynamic_cast<SimpleProfiler *>(trt_context_->getProfiler());
    if (profile != nullptr) {
      std::cout << *profile << std::endl;
    }
    delete profile;
  }
#endif
  if (trt_context_ != nullptr) {
    trt_context_->destroy();
    trt_context_ = nullptr;
  }
  if (engine_ != nullptr) {
    engine_->destroy();
    engine_ = nullptr;
  }
  if (tensor_bindings_ != nullptr) {
    delete[] tensor_bindings_;
    tensor_bindings_ = nullptr;
  }
  for (auto op : all_ops_) {
    delete op;
  }
}

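// Check whether the user-supplied dynamic-shape profiles are usable: every profile must describe all graph
// inputs, and its min/opt/max dims must have the same rank as the corresponding declared input shape.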
bool TensorRTSubGraph::IsValidProfileDims() const {
  if (trt_profile_config_.profiles.empty()) {
    MS_LOG(INFO) << "Number of profiles is 0.";
    return false;
  }
  for (auto &profile : trt_profile_config_.profiles) {
    if (profile.inputs.size() != trt_profile_config_.input_infos.size()) {
      MS_LOG(WARNING) << "Profile input size " << profile.inputs.size() << " != input shape size "
                      << trt_profile_config_.input_infos.size();
      return false;
    }
    for (size_t i = 0; i < profile.inputs.size(); i++) {
      const auto &profile_input = profile.inputs[i];
      const auto &input_info = trt_profile_config_.input_infos[i];
      if (profile_input.min_dims.size() != input_info.input_shape.size()) {
        MS_LOG(WARNING) << "Profile input " << input_info.name << " min dims number " << profile_input.min_dims.size()
                        << " != input shape dim number " << input_info.input_shape.size();
        return false;
      }
      if (profile_input.max_dims.size() != input_info.input_shape.size()) {
        MS_LOG(WARNING) << "Profile input " << input_info.name << " max dims number " << profile_input.max_dims.size()
                        << " != input shape dim number " << input_info.input_shape.size();
        return false;
      }
      if (profile_input.opt_dims.size() != input_info.input_shape.size()) {
        MS_LOG(WARNING) << "Profile input " << input_info.name << " opt dims number " << profile_input.opt_dims.size()
                        << " != input shape dim number " << input_info.input_shape.size();
        return false;
      }
    }
  }
  return true;
}

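// Init prepares everything needed before the network is parsed: graph in/out ops, the TensorRTContext,
// the builder config, the engine serializer, and one optimization profile per configured input-range
// profile (or a single default profile). If a serialized engine can be loaded, parsing is skipped later.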
int TensorRTSubGraph::Init(cudaStream_t stream, cublasHandle_t cublas_handle, cublasLtHandle_t cublaslt_handle) {
  auto ret = GetGraphInOutOps(inputs_, outputs_, &in_ops_, &out_ops_, all_ops_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Get TensorRT subgraph input and output ops failed.";
    return RET_ERROR;
  }
  ctx_ = new (std::nothrow) TensorRTContext();
  if (ctx_ == nullptr) {
    MS_LOG(ERROR) << "New TensorRTContext failed.";
    return RET_ERROR;
  }
  ctx_->SetRuntime(runtime_);
  if (!ctx_->Init()) {
    MS_LOG(ERROR) << "Init TensorRTContext failed.";
    return RET_ERROR;
  }
  if (SetDeviceConfig(stream, cublas_handle, cublaslt_handle) != RET_OK) {
    MS_LOG(WARNING) << "set tensorrt config failed.";
  }
  serializer_ = std::make_shared<TensorRTSerializer>(serialize_file_path_);
  if (serializer_ == nullptr) {
    MS_LOG(ERROR) << "create Serializer failed.";
    return RET_ERROR;
  }
  using_input_ranges_ = IsValidProfileDims();
  if (using_input_ranges_) {
    for (size_t i = 0; i != trt_profile_config_.profiles.size(); ++i) {
      profiles_.push_back(runtime_->GetBuilder()->createOptimizationProfile());
    }
  } else {
    profiles_.push_back(runtime_->GetBuilder()->createOptimizationProfile());
  }
  for (size_t i = 0; i != profiles_.size(); ++i) {
    if (profiles_[i] == nullptr) {
      MS_LOG(ERROR) << "create optimization profile failed.";
      return RET_ERROR;
    }
  }
  engine_ = serializer_->GetSerializedEngine();
  if (engine_ != nullptr) {
    MS_LOG(INFO) << "using serialized engine " << serialize_file_path_;
    return RET_OK;
  }
  for (size_t i = 0; i < inputs_.size(); i++) {
    if (inputs_[i].Shape().size() != DIMENSION_4D) {
      input_hw_index_ = -1;
    }
  }
  return RET_OK;
}

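// Register all optimization profiles with the builder config and build the CUDA engine from the parsed
// network; the engine is serialized to disk when a cache path is configured.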
int TensorRTSubGraph::BuildEngine() {
  for (auto &profile : profiles_) {
    if (this->config_->addOptimizationProfile(profile) == -1) {
      MS_LOG(ERROR) << "addOptimizationProfile failed.";
      return RET_ERROR;
    }
  }
  // print all network ops
  MS_LOG(INFO) << "build engine for tensorrt network: " << ctx_->network()->getName();
  for (int i = 0; i < ctx_->network()->getNbLayers(); i++) {
    MS_LOG(DEBUG) << "tensorrt op: " << ctx_->network()->getLayer(i)->getName();
  }
  MS_LOG(DEBUG) << "end of tensorrt network: " << ctx_->network()->getName();

  this->engine_ = runtime_->GetBuilder()->buildEngineWithConfig(*ctx_->network(), *this->config_);
  if (this->engine_ == nullptr) {
    MS_LOG(ERROR) << "Create engine failed in TensorRT network";
    return RET_ERROR;
  }
  if (serialize_file_path_.size() > 0) {
    serializer_->SaveSerializedEngine(engine_);
  }
  return RET_OK;
}

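// Create the builder config, enable FP16/INT8 modes when both the user settings and the platform allow
// them, bind the CUDA stream and cuBLAS handles to the runtime, and cap the builder workspace size.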
int TensorRTSubGraph::SetDeviceConfig(cudaStream_t stream, cublasHandle_t cublas_handle,
                                      cublasLtHandle_t cublaslt_handle) {
  if (config_ == nullptr) {
    this->config_ = runtime_->GetBuilder()->createBuilderConfig();
    if (this->config_ == nullptr) {
      MS_LOG(ERROR) << "create builder config failed.";
      return RET_ERROR;
    }
  }
  // set fp16
  if (device_info_->GetEnableFP16() && runtime_->GetBuilder()->platformHasFastFp16()) {
    MS_LOG(INFO) << "set fp16 flag successfully for tensorrt.";
    config_->setFlag(nvinfer1::BuilderFlag::kFP16);
    runtime_->SetRuntimePrecisionMode(RuntimePrecisionMode_FP16);
  }

  // set int8
  if (IsInt8Mode() && runtime_->GetBuilder()->platformHasFastInt8()) {
    MS_LOG(INFO) << "set int8 flag successfully for tensorrt.";
    config_->setFlag(nvinfer1::BuilderFlag::kINT8);
    // Mark calibrator as null
    config_->setInt8Calibrator(nullptr);
    input_hw_index_ = -1;
  } else {
    MS_LOG(INFO) << "inputs have no quant params or platform does not support int8.";
  }
  runtime_->SetCudaStream(stream, cublas_handle, cublaslt_handle);
  config_->setProfileStream(stream);
  stream_ = stream;

  MS_LOG(INFO) << GetRankID() << " tensorrt subgraph stream: " << stream_;

  // limit the max workspace size to 2100 MiB
  constexpr size_t kWorkspaceSize = static_cast<size_t>(2100) * (1 << 20);
  config_->setMaxWorkspaceSize(kWorkspaceSize);
  return RET_OK;
}

bool TensorRTSubGraph::IsInt8Mode() {
  for (auto cur_op : all_ops_) {
    if (cur_op->GetQuantType() == schema::QuantType_QUANT_ALL) {
      return true;
    }
  }
  return false;
}

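// Add a subgraph input tensor to the TensorRT network (or return the existing ITensor if it was already
// added), filling the optimization profiles with either the configured ranges or the tensor's static shape.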
nvinfer1::ITensor *TensorRTSubGraph::SetTensorRTNetworkInput(const TensorInfo &in_tensor, int index) {
  if (index < 0) {
    return nullptr;
  }
  for (int i = 0; i < ctx_->network()->getNbInputs(); i++) {
    if (in_tensor.Name().compare(ctx_->network()->getInput(i)->getName()) == 0) {
      MS_LOG(INFO) << "input tensor is already added in network: " << in_tensor.Name();
      return ctx_->network()->getInput(i);
    }
  }

  auto cuda_dtype = ConvertDataType(in_tensor.DataType());
  if (static_cast<int>(cuda_dtype) == -1) {
    MS_LOG(ERROR) << "Unsupported input data type " << static_cast<int>(in_tensor.DataType());
    return nullptr;
  }
  nvinfer1::Dims input_dims;
  if (using_input_ranges_) {
    input_dims = SetInputDimsProfile(in_tensor, index);
  } else {
    input_dims = ParseInputDimsProfile(in_tensor, index);
  }
  MS_LOG(INFO) << "add network input: " << in_tensor.Name();
  return ctx_->network()->addInput(in_tensor.Name().c_str(), cuda_dtype, input_dims);
}

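// Fill min/opt/max dimensions of every optimization profile from the user-provided profile config for
// the input at `index`, and return the dims used to declare the network input.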
nvinfer1::Dims TensorRTSubGraph::SetInputDimsProfile(const TensorInfo &in_tensor, int index) {
  auto input_info = trt_profile_config_.input_infos[index];
  auto input_dims = ConvertCudaDims(input_info.input_shape);
  DebugDims("input dims", input_dims);
  for (size_t i = 0; i < trt_profile_config_.profiles.size(); i++) {
    auto &profile = trt_profile_config_.profiles[i];
    auto min_dims = ConvertCudaDims(profile.inputs[index].min_dims);
    if (!profiles_[i]->setDimensions(input_info.name.c_str(), nvinfer1::OptProfileSelector::kMIN, min_dims)) {
      MS_LOG(ERROR) << "setDimensions of kMIN failed for " << input_info.name;
      return input_dims;
    }
    auto opt_dims = ConvertCudaDims(profile.inputs[index].opt_dims);
    if (!profiles_[i]->setDimensions(input_info.name.c_str(), nvinfer1::OptProfileSelector::kOPT, opt_dims)) {
      MS_LOG(ERROR) << "setDimensions of kOPT failed for " << input_info.name;
      return input_dims;
    }

    auto max_dims = ConvertCudaDims(profile.inputs[index].max_dims);
    if (!profiles_[i]->setDimensions(input_info.name.c_str(), nvinfer1::OptProfileSelector::kMAX, max_dims)) {
      MS_LOG(ERROR) << "setDimensions of kMAX failed for " << input_info.name;
      return input_dims;
    }
    DebugDims("min dims", min_dims);
    DebugDims("opt dims", opt_dims);
    DebugDims("max dims", max_dims);
  }
  return input_dims;
}

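// No explicit profile config: use the tensor's current static shape as min, opt and max of the single
// default profile, and record it back into trt_profile_config_ so later stages can query it.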
nvinfer1::Dims TensorRTSubGraph::ParseInputDimsProfile(const TensorInfo &in_tensor, int index) {
  nvinfer1::Dims input_dims = ConvertCudaDims(in_tensor.Shape());
  nvinfer1::Dims input_dims_min = ConvertCudaDims(in_tensor.Shape());
  if (!profiles_.front()->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kMIN, input_dims_min)) {
    MS_LOG(ERROR) << "setDimensions of kMIN failed for " << in_tensor.Name();
    return input_dims;
  }
  nvinfer1::Dims input_dims_opt = ConvertCudaDims(in_tensor.Shape());
  if (!profiles_.front()->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kOPT, input_dims_opt)) {
    MS_LOG(ERROR) << "setDimensions of kOPT failed for " << in_tensor.Name();
    return input_dims;
  }
  nvinfer1::Dims input_dims_max = ConvertCudaDims(in_tensor.Shape());
  // input_dims_max should be the same as the network input dims
  if (!profiles_.front()->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kMAX, input_dims_max)) {
    MS_LOG(ERROR) << "setDimensions of kMAX failed for " << in_tensor.Name();
    return input_dims;
  }
  if (trt_profile_config_.profiles.empty()) {
    ProfileItem profile_item;
    profile_item.inputs.resize(inputs_.size());
    trt_profile_config_.profiles.push_back(profile_item);
  }
  auto &profile_item = trt_profile_config_.profiles.back();
  profile_item.inputs[index].min_dims = ConvertMSShape(input_dims_min);
  profile_item.inputs[index].opt_dims = ConvertMSShape(input_dims_opt);
  profile_item.inputs[index].max_dims = ConvertMSShape(input_dims_max);

  DebugDims("input min dims", input_dims_min);
  DebugDims("input opt dims", input_dims_opt);
  DebugDims("input max dims", input_dims_max);
  return input_dims;
}

int TensorRTSubGraph::ParseInputsProfile() {
  MS_LOG(INFO) << "using serialized engine.";
  for (size_t i = 0; i < inputs_.size(); i++) {
    auto dim = ParseInputDimsProfile(inputs_[i], i);
    if (dim.nbDims <= 0) {
      MS_LOG(ERROR) << "input dims are invalid.";
      return RET_ERROR;
    }
  }
  return RET_OK;
}

int TensorRTSubGraph::GetInputIndexByName(const std::string &name) {
  for (size_t i = 0; i != inputs().size(); ++i) {
    if (inputs()[i].Name() == name) {
      return static_cast<int>(i);
    }
  }
  return -1;
}

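// Translate every lite op into TensorRT layers: register network inputs and constant (weight) tensors,
// let each op add its layers, mark the network outputs, and finally build the engine. When a serialized
// engine is already loaded, only the input profiles need to be parsed.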
int TensorRTSubGraph::BuildTensorRTGraph() {
  MS_ASSERT(!all_ops_.empty());
  int ret;
  if (engine_ != nullptr) {
    return ParseInputsProfile();
  }
  // build engine online
  for (auto cur_op : all_ops_) {
    cur_op->SetRuntime(runtime_);
    for (size_t i = 0; i != cur_op->inputs().size(); ++i) {
      // Data From CPU
      auto in_tensor = cur_op->inputs()[i];
      if (IsSubGraphInputTensor(this->inputs(), in_tensor)) {
        nvinfer1::ITensor *trt_tensor = SetTensorRTNetworkInput(in_tensor, GetInputIndexByName(in_tensor.Name()));
        if (trt_tensor == nullptr) {
          MS_LOG(ERROR) << "SetTensorRTNetworkInput failed for " << in_tensor.Name();
          return RET_ERROR;
        }
        // avoid bool input tensor
        cur_op->SetSupportInputBool(false);

        ctx_->RegisterTensorWithSameName(ITensorHelper{trt_tensor, in_tensor.format(), true}, in_tensor.Name());
        continue;
      }

      ITensorHelper trt_tensor = FindTensorRTInputs(cur_op, in_tensor);
      if (trt_tensor.trt_tensor_ == nullptr) {
        // weight tensor
        auto weight_handled_inner =
          cur_op->IsWeightInputHanledInner() ||
          trt_specific_weight_handled_inner_.find(cur_op->type()) != trt_specific_weight_handled_inner_.end();
        if (!weight_handled_inner) {
          if (!in_tensor.IsConst()) {
            MS_LOG(ERROR) << "Weight Tensor data is not const.";
            return RET_ERROR;
          }
          trt_tensor.trt_tensor_ = lite::ConvertConstantTensor(ctx_, in_tensor, cur_op->GetOpName());
          trt_tensor.format_ = Format::NCHW;
          MS_LOG(INFO) << "auto convert constant tensor for: " << in_tensor.Name();
          ctx_->RegisterTensor(trt_tensor, in_tensor.Name());
        }
      } else {
        ctx_->RegisterTensor(trt_tensor, in_tensor.Name());
      }
    }
    MS_LOG(DEBUG) << "Parsing TensorRT op for " << cur_op->GetOpName();

    ret = cur_op->AddInnerOp(ctx_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Add op failed in TensorRT network: " << cur_op->GetOpName();
      return RET_ERROR;
    }
    ret = cur_op->SetInt8DynamicRange(ctx_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Set Int8 dynamic range failed in TensorRT network: " << cur_op->GetOpName();
      return RET_ERROR;
    }
  }
  ret = MarkOutputs();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "MarkOutputs failed in TensorRT network";
    return ret;
  }

  std::string network_name = "network_" + std::string(ctx_->network()->getInput(0)->getName()) + "_" +
                             std::string(ctx_->network()->getOutput(0)->getName());
  ctx_->network()->setName(network_name.c_str());
  this->name_ = network_name;
  ret = BuildEngine();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Create engine failed in TensorRT network";
    return ret;
  }
  return RET_OK;
}

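// Mark the subgraph outputs in the TensorRT network. An identity layer is appended so the binding keeps
// the original tensor name, and the index of the first dynamic (-1) dimension is remembered as the
// output batch-size index.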
int TensorRTSubGraph::MarkOutputs() {
  // Mark NetWork Output Tensor.
  for (const auto &out_tensor : outputs_) {
    std::string output_name = out_tensor.Name();
    auto input_it = std::find_if(inputs_.begin(), inputs_.end(),
                                 [=](const TensorInfo &input) { return input.Name() == output_name; });
    if (input_it != inputs_.end()) {
      nvinfer1::ITensor *trt_tensor = SetTensorRTNetworkInput(*input_it, GetInputIndexByName(input_it->Name()));
      if (trt_tensor == nullptr) {
        MS_LOG(ERROR) << "SetTensorRTNetworkInput failed for " << input_it->Name();
        return RET_ERROR;
      }
      ctx_->network()->markOutput(*trt_tensor);
      continue;
    }
    if (out_tensor.IsConst()) {
      MS_LOG(INFO) << "markOutput for: " << out_tensor.Name();
      auto output_helper = ctx_->MsName2Tensor(out_tensor.Name());
      if (output_helper.trt_tensor_ == nullptr) {
        output_helper.trt_tensor_ = lite::ConvertConstantTensor(ctx_, out_tensor, out_tensor.Name());
        output_helper.format_ = Format::NCHW;
        MS_LOG(INFO) << "auto convert constant tensor for: " << out_tensor.Name();
        ctx_->RegisterTensor(output_helper, out_tensor.Name());
      }
      nvinfer1::ITensor *out_trt_tensor = output_helper.trt_tensor_;
      out_trt_tensor->setName(("__" + out_tensor.Name()).c_str());
      auto identity_layer = ctx_->network()->addIdentity(*out_trt_tensor);
      if (identity_layer == nullptr) {
        MS_LOG(ERROR) << "addIdentity failed for " << out_tensor.Name();
        return RET_ERROR;
      }
      out_trt_tensor = identity_layer->getOutput(0);
      out_trt_tensor->setName(out_tensor.Name().c_str());
      ctx_->network()->markOutput(*out_trt_tensor);
      for (int n = 0; n < out_trt_tensor->getDimensions().nbDims; n++) {
        if (out_trt_tensor->getDimensions().d[n] == -1) {
          output_batchsize_index_ = n;
          break;
        }
      }
    }
    for (auto out_op : this->out_ops_) {
      for (size_t index = 0; index < out_op->outputs().size(); index++) {
        if (out_op->outputs()[index] == out_tensor) {
          MS_LOG(INFO) << "markOutput for: " << out_tensor.Name();
          auto output_helper = out_op->output(ctx_, index);
          nvinfer1::ITensor *out_trt_tensor = output_helper.trt_tensor_;
          out_trt_tensor->setName(("__" + out_tensor.Name()).c_str());
          auto out_layer = ctx_->network()->addIdentity(*out_trt_tensor);
          if (out_layer == nullptr) {
            MS_LOG(ERROR) << "addIdentity failed for " << out_tensor.Name();
            return RET_ERROR;
          }
          if (out_tensor.DataType() == DataType::kNumberTypeFloat16) {
            MS_LOG(WARNING) << "Cast output tensor " << out_tensor.Name() << " to fp16";
            out_layer->setOutputType(0, nvinfer1::DataType::kHALF);
          }
          out_trt_tensor = out_layer->getOutput(0);
          out_trt_tensor->setName(out_tensor.Name().c_str());
          ctx_->network()->markOutput(*out_trt_tensor);
          for (int n = 0; n < out_trt_tensor->getDimensions().nbDims; n++) {
            if (out_trt_tensor->getDimensions().d[n] == -1) {
              output_batchsize_index_ = n;
              break;
            }
          }
        }
      }
    }
  }
  return RET_OK;
}

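// Create the execution context, allocate device memory for all input/output bindings based on the
// largest-volume profile, and let each op prepare its own bindings.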
int TensorRTSubGraph::Prepare() {
  int ret = lite::SetCudaDevice(device_info_);
  if (ret != RET_OK) {
    return ret;
  }
  if (this->engine_ == nullptr) {
    MS_LOG(ERROR) << "engine_ is null in this builder_";
    return RET_ERROR;
  }
  this->trt_context_ = this->engine_->createExecutionContext();
  if (this->trt_context_ == nullptr) {
    MS_LOG(ERROR) << "TensorRTSubGraph create context failed.";
    return RET_ERROR;
  }

#ifdef PROFILER_
  auto profiler = new (std::nothrow) SimpleProfiler("myprofiler");
  if (profiler == nullptr) {
    MS_LOG(WARNING) << "Cannot create profiler";
  }
  this->trt_context_->setProfiler(profiler);
#endif

  int binding_num = this->engine_->getNbBindings();
  if (binding_num <= 0) {
    MS_LOG(ERROR) << "TensorRTSubGraph binding num <= 0.";
    return RET_ERROR;
  }
  tensor_bindings_ = new (std::nothrow) void *[binding_num];
  if (tensor_bindings_ == nullptr) {
    MS_LOG(ERROR) << "malloc tensor binding array failed.";
    return RET_ERROR;
  }
  profile_index_ = MaxVolumnProfileIndex();
  if (this->trt_context_->setOptimizationProfile(profile_index_)) {
    MS_LOG(INFO) << "setOptimizationProfile: " << profile_index_;
  }
  const auto &profile = trt_profile_config_.profiles[profile_index_];
  for (size_t i = 0; i != inputs_.size(); ++i) {
    auto &tensor = inputs_[i];
    auto max_profile_dims = profile.inputs[i].max_dims;
    tensor.SetShape(max_profile_dims);
    int64_t volume = std::accumulate(max_profile_dims.begin(), max_profile_dims.end(), static_cast<int64_t>(1),
                                     std::multiplies<int64_t>());
    auto type_size = lite::DataTypeSize(static_cast<enum TypeId>(tensor.DataType()));
    auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(tensor, volume * type_size);
    if (device_ptr == nullptr) {
      MS_LOG(ERROR) << "malloc for inputs tensor device memory failed.";
      return RET_ERROR;
    }
    auto tensor_name = tensor.Name();
    trt_in_tensor_name_.push_back(tensor_name);
    int index = GetProfileBindingIndex(tensor_name, profile_index_);
    MS_LOG(INFO) << "device index " << index << " for tensor : " << tensor_name << " attr: " << device_ptr;
    tensor_bindings_[index] = device_ptr;
    nvinfer1::Dims input_dims = ConvertCudaDims(profile.inputs[i].max_dims);
    if (!this->trt_context_->setBindingDimensions(index, input_dims)) {
      MS_LOG(ERROR) << "invalid input dims of " << tensor.Name();
      return RET_ERROR;
    }
  }
  if (!this->trt_context_->allInputDimensionsSpecified()) {
    MS_LOG(ERROR) << "input dims need to be specified.";
    return RET_ERROR;
  }
  for (auto op : all_ops_) {
    ret = op->Prepare(tensor_bindings_, engine_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "prepare op failed of " << op->GetOpName();
      return RET_ERROR;
    }
  }
  for (auto &tensor : outputs_) {
    int max_index = GetProfileBindingIndex(tensor.Name(), profile_index_);
    auto out_dims = trt_context_->getBindingDimensions(max_index);
    int elem_num = std::accumulate(out_dims.d, out_dims.d + out_dims.nbDims, 1, std::multiplies<int>());
    DebugDims("out dims", out_dims);
    MS_LOG(INFO) << "Set output shape by tensorrt binding output";
    tensor.SetShape(lite::ConvertMSShape(out_dims));
    auto type_size = lite::DataTypeSize(static_cast<enum TypeId>(tensor.DataType()));
    if (tensor.DataType() == DataType::kNumberTypeBool) {
      type_size = lite::DataTypeSize(static_cast<enum TypeId>(DataType::kNumberTypeInt32));
    }
    auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(tensor, elem_num * type_size);
    if (device_ptr == nullptr) {
      MS_LOG(ERROR) << "malloc for outputs tensor device memory failed.";
      return RET_ERROR;
    }
    for (size_t j = 0; j != profiles_.size(); ++j) {
      int index = GetProfileBindingIndex(tensor.Name(), j);
      tensor_bindings_[index] = device_ptr;
    }
    trt_out_tensor_name_.push_back(tensor.Name());
  }
  return RET_OK;
}

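// Return the index of the first optimization profile whose [min, max] ranges contain the requested
// input shapes, or -1 if no profile matches.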
int TensorRTSubGraph::SelectProfile(const std::vector<ShapeVector> &new_shapes) const {
  std::vector<int> profile_index;
  for (size_t i = 0; i < profiles_.size(); ++i) {
    const auto &profile = trt_profile_config_.profiles[i];
    bool condition = true;
    for (size_t j = 0; j < trt_in_tensor_name_.size(); ++j) {
      auto new_shape = new_shapes[j];
      auto profile_input = profile.inputs[j];
      if (new_shape.size() != profile_input.max_dims.size()) {
        condition = false;
      } else {
        for (size_t od = 0; od < new_shape.size(); od++) {
          if (new_shape[od] < profile_input.min_dims[od] || new_shape[od] > profile_input.max_dims[od]) {
            condition = false;
            break;
          }
        }
      }
    }
    if (condition) {
      profile_index.push_back(static_cast<int>(i));
    }
  }
  return profile_index.empty() ? -1 : profile_index.front();
}

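// Return the index of the profile whose first input has the largest max-dims volume; this profile is
// used for the initial device memory allocation in Prepare().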
size_t TensorRTSubGraph::MaxVolumnProfileIndex() const {
  int64_t max_volume = std::numeric_limits<int64_t>::min();
  size_t max_volume_index = 0;
  for (size_t i = 0; i < trt_profile_config_.profiles.size(); ++i) {
    const auto &profile = trt_profile_config_.profiles[i];
    // depend on the first input tensor
    int64_t volume = std::accumulate(profile.inputs[0].max_dims.begin(), profile.inputs[0].max_dims.end(),
                                     static_cast<int64_t>(1), std::multiplies<int64_t>());
    if (volume > max_volume) {
      max_volume_index = i;
      max_volume = volume;
    }
  }
  return max_volume_index;
}

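// TensorRT names bindings of non-zero profiles as "<tensor name> [profile <k>]", so the profile suffix
// is appended before querying the engine for the binding index.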
int TensorRTSubGraph::GetProfileBindingIndex(const std::string &name, size_t profile_index) {
  std::string binding_name = name;
  if (profile_index != 0) {
    binding_name += " [profile " + std::to_string(profile_index) + "]";
  }
  return this->engine_->getBindingIndex(binding_name.c_str());
}

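// Resize handling: pick a matching optimization profile for the new input shapes, validate the resize,
// update the binding dimensions, and refresh the output shapes reported by TensorRT.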
int TensorRTSubGraph::OnNewInputShapes(const std::vector<ShapeVector> &new_shapes) {
  if (inputs_.size() != new_shapes.size()) {
    MS_LOG(ERROR) << "Graph inputs size " << inputs_.size() << " != resize input size " << new_shapes.size();
    return RET_ERROR;
  }
  auto select_profile_index = SelectProfile(new_shapes);
  if (select_profile_index < 0) {
    MS_LOG(ERROR) << "Unsupported input shape " << new_shapes;
    return RET_ERROR;
  }
  profile_index_ = static_cast<size_t>(select_profile_index);
  if (this->trt_context_->setOptimizationProfile(profile_index_)) {
    MS_LOG(INFO) << "setOptimizationProfile: " << profile_index_;
  }
  int batch_size = -1;
  for (size_t i = 0; i < trt_in_tensor_name_.size(); i++) {
    if (inputs_[i].Shape() == new_shapes[i]) {
      continue;
    }
    if (input_batchsize_index_ == -1) {
      MS_LOG(ERROR) << "current network doesn't support resize.";
      return RET_ERROR;
    }
    inputs_[i].SetShape(new_shapes[i]);
    if (ctx_->network() != nullptr) {
      for (int j = 0; j < ctx_->network()->getNbInputs(); j++) {
        if (trt_in_tensor_name_[i].compare(ctx_->network()->getInput(j)->getName()) != 0) {
          continue;
        }
        nvinfer1::Dims construct_dims = ctx_->network()->getInput(j)->getDimensions();
        bool ret = ValidInputResizeDims(construct_dims, inputs_[i].Shape());
        if (!ret) {
          MS_LOG(ERROR) << "input resize shape is invalid.";
          return RET_ERROR;
        }
      }
    }

    MS_LOG(INFO) << "resize at input_batch_index " << input_batchsize_index_ << ", update batch size to "
                 << inputs_[i].Shape()[input_batchsize_index_];
    int new_batch_size = inputs_[i].Shape()[input_batchsize_index_];
    if (batch_size != -1 && batch_size != new_batch_size) {
      MS_LOG(ERROR) << "Batch size " << batch_size << " of input 0 != batch size " << new_batch_size << " of input "
                    << i;
      return RET_ERROR;
    }
    batch_size = new_batch_size;

    int index = GetProfileBindingIndex(trt_in_tensor_name_[i], profile_index_);
    // Set actual input size
    nvinfer1::Dims input_dims = ConvertCudaDims(inputs_[i].Shape());
    for (int od = 0; od < input_dims.nbDims; od++) {
      MS_LOG(DEBUG) << "in tensor " << trt_in_tensor_name_[i] << " dims at " << od << " is " << input_dims.d[od];
    }

    if (!this->trt_context_->setBindingDimensions(index, input_dims)) {
      MS_LOG(ERROR) << "invalid input dims of " << inputs_[i].Name() << ", profile index: " << profile_index_
                    << ", dst dims: " << CudaDimsAsString(input_dims);
      return RET_ERROR;
    }
  }
  if (!this->trt_context_->allInputDimensionsSpecified()) {
    MS_LOG(ERROR) << "input dims need to be specified.";
    return RET_ERROR;
  }
  if (batch_size != -1) {
    for (size_t i = 0; i < trt_out_tensor_name_.size(); i++) {
      auto index = GetProfileBindingIndex(trt_out_tensor_name_[i], profile_index_);
      auto out_dims = trt_context_->getBindingDimensions(index);
      DebugDims("out dims", out_dims);
      auto new_shape = lite::ConvertMSShape(out_dims);
      MS_LOG(INFO) << "Set output shape of " << trt_out_tensor_name_[i] << " to " << new_shape
                   << " by tensorrt binding output";
      outputs_[i].SetShape(new_shape);
    }
  }
  return RET_OK;
}

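// Variable-sequence-length (VSL) pre-processing for the transformer-optimized path (assumption: the inputs
// follow an [input_ids, [expert_ids,] attention_mask, position_ids, current_index, ..., batch_valid_length]
// layout, with the last input holding per-batch valid lengths). Valid tokens of each batch are packed
// contiguously on the host before the H2D copy, so only h_token * expert_num elements are transferred.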
int TensorRTSubGraph::VSLPreExectute(const std::vector<tensor::Tensor> &inputs, int i, bool sync,
                                     const std::string &tensor_name) {
  const bool is_expert_ids = (inputs.size() == Num6);
  const int input_ids_idx = 0;
  const int expert_ids_idx = (is_expert_ids) ? Num1 : -1;
  const int attn_mask_idx = Num1 + is_expert_ids;
  const int pos_ids_idx = Num2 + is_expert_ids;
  const int current_idx_idx = Num3 + is_expert_ids;
  if (i == input_ids_idx || i == expert_ids_idx || i == pos_ids_idx) {
    int *in_ptr = static_cast<int *>(inputs[i].data_ptr()->data());
    int batch = inputs[trt_in_tensor_name_.size() - Num1].ElementsNum();
    int seq = inputs[0].ElementsNum() / batch;
    int expert_num = (i != expert_ids_idx) ? Num1 : inputs[i].ElementsNum() / (batch * seq);
    bool incremental_mode = (static_cast<const int32_t *>(inputs[pos_ids_idx].data().const_data())[0] != 0);
    size_t h_token = 0;
    for (int k = 0; k < batch; k++) {
      int actual_seq_len =
        (incremental_mode)
          ? Num1
          : (static_cast<const int32_t *>(inputs[trt_in_tensor_name_.size() - Num1].data().const_data())[k] + Num1);
      int batch_valid = static_cast<const int32_t *>(inputs[trt_in_tensor_name_.size() - Num1].data().const_data())[k];
      h_token += (batch_valid == -1) ? 0 : actual_seq_len;
    }
    for (int j = 0; j < expert_num; j++) {
      size_t h_token_idx = 0;
      for (int k = 0; k < batch; k++) {
        int actual_seq_len =
          (incremental_mode)
            ? Num1
            : (static_cast<const int32_t *>(inputs[trt_in_tensor_name_.size() - Num1].data().const_data())[k] + Num1);
        for (int l = 0; l < actual_seq_len; l++) {
          in_ptr[j * h_token + h_token_idx + l] =
            static_cast<int *>(inputs[i].data_ptr()->data())[j * batch * seq + k * seq + l];
        }
        h_token_idx += actual_seq_len;
      }
    }
    return runtime_->GetAllocator()->SyncMemHostToDevice(inputs[i], tensor_name, sync,
                                                         h_token * expert_num * sizeof(int));
  } else if (i != attn_mask_idx && i != current_idx_idx) {
    return runtime_->GetAllocator()->SyncMemHostToDevice(inputs[i], tensor_name, sync);
  }
  return RET_OK;
}

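// Copy (or bind) the input tensors to device memory and set up the output bindings before launching the
// engine; user-provided device addresses are used directly when available.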
int TensorRTSubGraph::PreExecute(const std::vector<tensor::Tensor> &inputs, const std::vector<tensor::Tensor> &outputs,
                                 bool sync) {
  if (inputs_.size() != inputs.size()) {
    MS_LOG(ERROR) << "Graph inputs size " << inputs_.size() << " != execute inputs size " << inputs.size();
    return RET_ERROR;
  }
  if (!outputs.empty() && outputs.size() != outputs_.size()) {
    MS_LOG(ERROR) << "Graph outputs size " << outputs_.size() << " != execute outputs size " << outputs.size();
    return RET_ERROR;
  }
  std::vector<ShapeVector> new_shapes;
  std::transform(inputs.begin(), inputs.end(), std::back_inserter(new_shapes), [](auto &t) { return t.shape_c(); });
  auto ret = OnNewInputShapes(new_shapes);
  if (ret != RET_OK) {
    return ret;
  }
  for (size_t i = 0; i < trt_in_tensor_name_.size(); i++) {
    auto trt_tensor_name = trt_in_tensor_name_[i];
    void *device_ptr = nullptr;
    auto input_device_address = inputs[i].device_address();
    if (input_device_address != nullptr && input_device_address->GetMutablePtr() != nullptr) {
      device_ptr = input_device_address->GetMutablePtr();
    } else {
      device_ptr = runtime_->GetAllocator()->MallocDeviceMem(trt_tensor_name, inputs_[i].DataSize(),
                                                             ConvertDataType(inputs_[i].DataType()));
      if (device_ptr == nullptr) {
        MS_LOG(ERROR) << "realloc for input tensor device memory failed.";
        return RET_ERROR;
      }
      if (runtime_->IsTransformerOptimizeSigma()) {
        ret = VSLPreExectute(inputs, i, sync, trt_tensor_name);
      } else {
        ret = runtime_->GetAllocator()->SyncMemHostToDevice(inputs[i], trt_tensor_name, sync);
      }
      if (ret != RET_OK) {
        MS_LOG(ERROR) << "sync mem from host to device failed for " << trt_tensor_name;
        return RET_ERROR;
      }
      runtime_->GetAllocator()->MarkMemValid(trt_tensor_name, true);
    }
    int index = GetProfileBindingIndex(trt_tensor_name, profile_index_);
    MS_LOG(INFO) << "device index " << index << " for tensor : " << trt_tensor_name << " attr: " << device_ptr;
    tensor_bindings_[index] = device_ptr;
  }
  for (size_t i = 0; i < trt_out_tensor_name_.size(); i++) {
    const auto &trt_out_tensor_name = trt_out_tensor_name_[i];
    int index = GetProfileBindingIndex(trt_out_tensor_name, profile_index_);
    void *device_ptr = nullptr;
    if (outputs.size() > i) {
      auto &output = outputs[i];
      if (output.device_address() && output.device_address()->GetMutablePtr()) {
        if (output.Size() < outputs_[i].DataSize()) {
          MS_LOG(ERROR) << "Specified output device data size " << output.Size()
                        << " cannot be less than execute output data size " << outputs_[i].DataSize()
                        << ", output shape: " << outputs_[i].Shape();
          return RET_ERROR;
        }
        device_ptr = output.device_address()->GetMutablePtr();
      }
    }
    if (!device_ptr) {
      device_ptr = runtime_->GetAllocator()->MallocDeviceMem(trt_out_tensor_name, outputs_[i].DataSize(),
                                                             ConvertDataType(outputs_[i].DataType()));
      if (device_ptr == nullptr) {
        MS_LOG(ERROR) << "realloc for outputs tensor device memory failed.";
        return RET_ERROR;
      }
    }
    tensor_bindings_[index] = device_ptr;
  }
  return RET_OK;
}

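// Read back the output bindings after execution: copy into the caller's tensors when provided (host or
// device), otherwise allocate new host tensors, then invalidate the cached device buffers for the next run.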
int TensorRTSubGraph::PostExecute(std::vector<tensor::Tensor> *outputs, bool sync) {
  if (!outputs->empty() && outputs->size() != outputs_.size()) {
    MS_LOG(ERROR) << "Graph outputs size " << outputs_.size() << " != execute outputs size " << outputs->size();
    return RET_ERROR;
  }
  auto has_outputs = !outputs->empty();
  for (size_t i = 0; i < trt_out_tensor_name_.size(); i++) {
    const auto &trt_out_tensor_name = trt_out_tensor_name_[i];
    auto index = GetProfileBindingIndex(trt_out_tensor_name, profile_index_);
    // actual output tensor dims
    auto out_dims = this->trt_context_->getBindingDimensions(index);
    std::vector<int64_t> new_shape = lite::ConvertMSShape(out_dims);
    for (int od = 0; od < out_dims.nbDims; od++) {
      MS_LOG(DEBUG) << "out tensor " << trt_out_tensor_name << " dims at " << od << " is " << new_shape[od];
    }
    runtime_->GetAllocator()->MarkMemValid(trt_out_tensor_name, true);
    if (has_outputs) {
      auto &tensor = outputs->at(i);
      auto dst_device = tensor.device_address();
      if (dst_device == nullptr || dst_device->GetMutablePtr() == nullptr) {
        if (tensor.Size() < outputs_[i].DataSize()) {
          MS_LOG(ERROR) << "Specified output host data size " << tensor.Size()
                        << " cannot be less than execute output data size " << outputs_[i].DataSize()
                        << ", output shape: " << new_shape;
          return RET_ERROR;
        }
        auto host_address = tensor.data_c();
        if (host_address == nullptr) {
          MS_LOG(ERROR) << "Specified output device or host address cannot be nullptr";
          return RET_ERROR;
        }
        int sync_ret = runtime_->GetAllocator()->SyncMemDeviceToHost(host_address, outputs_[i].DataSize(),
                                                                     trt_out_tensor_name, sync);
        if (sync_ret != RET_OK) {
          MS_LOG(ERROR) << "sync mem from device to host failed for " << trt_out_tensor_name;
          return sync_ret;
        }
      }
    } else {
      tensor::Tensor output_tensor(static_cast<enum TypeId>(outputs_[i].DataType()), new_shape);
      int sync_ret = runtime_->GetAllocator()->SyncMemDeviceToHost(&output_tensor, trt_out_tensor_name, sync);
      if (sync_ret != RET_OK) {
        MS_LOG(ERROR) << "sync mem from device to host failed for " << trt_out_tensor_name;
        return sync_ret;
      }
      outputs->push_back(output_tensor);
    }
    runtime_->GetAllocator()->MarkMemValid(trt_out_tensor_name, false);
  }
  // make mem invalid, prepare for next execute
  for (size_t i = 0; i < inputs_.size(); i++) {
    runtime_->GetAllocator()->MarkMemValid(trt_in_tensor_name_[i], false);
  }
  return RET_OK;
}

bool TensorRTSubGraph::ValidInputResizeDims(const nvinfer1::Dims &construct_dims,
                                            const std::vector<int64_t> &resize_input_shape) {
  if (static_cast<size_t>(construct_dims.nbDims) != resize_input_shape.size()) {
    MS_LOG(ERROR) << "invalid resize input: dim count " << resize_input_shape.size()
                  << " != network input dim count " << construct_dims.nbDims;
    return false;
  }
  return true;
}

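// Run one inference: bind inputs/outputs, execute the engine synchronously (executeV2) or asynchronously
// on the CUDA stream (enqueueV2, when compiled with ASYNC_INFER), then collect the outputs.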
int TensorRTSubGraph::Execute(const std::vector<tensor::Tensor> &inputs, std::vector<tensor::Tensor> *outputs) {
#ifdef ASYNC_INFER
  bool sync = false;
#else
  bool sync = true;
#endif
  int ret = lite::SetCudaDevice(device_info_);
  if (ret != RET_OK) {
    return ret;
  }
  ret = PreExecute(inputs, *outputs, sync);
  if (ret != RET_OK) {
    return ret;
  }
  if (sync) {
    if (!this->trt_context_->executeV2(tensor_bindings_)) {
      MS_LOG(ERROR) << "TensorRT execute failed.";
      return RET_ERROR;
    }
  } else {
    if (!this->trt_context_->enqueueV2(tensor_bindings_, stream_, nullptr)) {
      MS_LOG(ERROR) << "TensorRT execute failed.";
      return RET_ERROR;
    }
  }
  ret = PostExecute(outputs, sync);
  if (ret != RET_OK) {
    return ret;
  }
  if (!sync) {
    cudaStreamSynchronize(stream_);
  }
  return RET_OK;
}

int TensorRTSubGraph::Resize(const std::vector<tensor::Tensor> &, const std::vector<ShapeVector> &new_shapes) {
  return OnNewInputShapes(new_shapes);
}

ITensorHelper TensorRTSubGraph::FindTensorRTInputs(TensorRTOp *cur_op, const TensorInfo &in_tensor) {
  for (auto input_op : cur_op->in_ops()) {
    for (size_t i = 0; i < input_op->outputs().size(); i++) {
      auto out_tensor = input_op->outputs().at(i);
      if (in_tensor.Name().compare(out_tensor.Name()) == 0) {
        return input_op->output(ctx_, i);
      }
    }
  }
  return ITensorHelper{};
}
}  // namespace mindspore::lite