1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "tools/benchmark/benchmark_unified_api.h"
18 #include <cinttypes>
19 #include <algorithm>
20 #include <utility>
21 #include <functional>
22 #include <iomanip>
23 #include <limits>
24 #include "src/common/common.h"
25 #include "src/tensor.h"
26 #include "tools/common/string_util.h"
27 #include "nnacl/nnacl_common.h"
28 #ifdef ENABLE_ARM64
29 #include <linux/perf_event.h>
30 #include <sys/ioctl.h>
31 #include <asm/unistd.h>
32 #include <unistd.h>
33 #endif
34 #ifdef SUPPORT_NNIE
35 #include "include/hi_common.h"
36 #include "include/hi_comm_vb.h"
37 #include "include/mpi_sys.h"
38 #include "include/mpi_vb.h"
39 #endif
40 #ifdef PARALLEL_INFERENCE
41 #include <thread>
42 #include "src/common/config_file.h"
43 #endif
44 #include "include/c_api/model_c.h"
45 #include "include/c_api/context_c.h"
46 
47 namespace mindspore {
48 constexpr size_t kDataToStringMaxNum = 40;
49 constexpr int kPrintDataNum = 20;
50 constexpr int kFrequencyDefault = 3;
51 constexpr int kPercentageDivisor = 100;
52 constexpr int kDumpInputsAndOutputs = 0;
53 constexpr int kDumpOutputs = 2;
54 #ifdef PARALLEL_INFERENCE
55 constexpr int kMaxRequestNum = 200;
56 #endif
57 namespace lite {
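// Fills every model input with random host data and uploads it into an OpenGL texture.
// Used only on the GL-texture benchmark path (flags_->enable_gl_texture_) when flags_->in_data_file_ is empty.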
58 int BenchmarkUnifiedApi::GenerateGLTexture(std::map<std::string, GLuint> *input_gl_texture) {
59   for (auto tensor : ms_inputs_for_api_) {
60     float *input_data = reinterpret_cast<float *>(malloc(tensor.DataSize()));
61     if (input_data == nullptr) {
62       MS_LOG(ERROR) << "malloc input_data failed";
63       return RET_ERROR;
64     }
65     int status = GenerateRandomData(tensor.DataSize(), input_data, static_cast<int>(tensor.DataType()));
66     if (status != RET_OK) {
67       free(input_data);
68       std::cerr << "GenerateRandomData for inTensor failed: " << status << std::endl;
69       MS_LOG(ERROR) << "GenerateRandomData for inTensor failed:" << status;
70       return status;
71     }
72     status = FillGLTextureToTensor(input_gl_texture, &tensor, tensor.Name(), input_data);
73     free(input_data);
74     if (status != RET_OK) {
75       MS_LOG(ERROR) << "Fill GLTexture to input tensor failed: " << status;
76       return status;
77     }
78   }
79 
80   return RET_OK;
81 }
82 
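// Maps a tensor shape (2D/3D/4D, NHWC order) onto width/height/channel, then either creates an
// empty GL texture (data == nullptr) or copies the host buffer into a device texture, recording
// the texture id in gl_texture under the tensor name.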
83 int BenchmarkUnifiedApi::FillGLTextureToTensor(std::map<std::string, GLuint> *gl_texture, mindspore::MSTensor *tensor,
84                                                std::string name, void *data) {
85   MS_CHECK_TRUE_MSG(gl_texture != nullptr, RET_ERROR, "gl_texture is nullptr");
86   MS_CHECK_TRUE_MSG(tensor != nullptr, RET_ERROR, "tensor is nullptr");
87 
88   auto image_id = 0;
89 
90   int width = 1, height = 1, channel = 1;
91   if (tensor->Shape().size() == DIMENSION_2D) {
92     height = tensor->Shape()[kNHWC_N];
93     channel = tensor->Shape()[kNHWC_H];
94   } else if (tensor->Shape().size() == DIMENSION_3D) {
95     width = tensor->Shape()[kNHWC_H];
96     height = tensor->Shape()[kNHWC_N];
97     channel = tensor->Shape()[kNHWC_C];
98   } else if (tensor->Shape().size() == DIMENSION_4D) {
99     width = tensor->Shape()[kNHWC_W];
100     height = tensor->Shape()[kNHWC_H];
101     channel = tensor->Shape()[kNHWC_C];
102   } else {
103     MS_LOG(ERROR) << "the tensor shape is not supported";
104     return RET_ERROR;
105   }
106 
107   if (data == nullptr) {
108     image_id = gl_runtime_.GLCreateTexture(width, height, channel);
109   } else {
110     image_id = gl_runtime_.CopyHostToDeviceTexture(data, width, height, channel);
111   }
112 
113   if (image_id != GL_NONE) {
114     gl_texture->insert(std::pair<std::string, GLuint>(name, image_id));
115   } else {
116     MS_LOG(ERROR) << "glMemPool CopyHostToDeviceTexture failed";
117   }
118   return RET_OK;
119 }
120 
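// Prepares GL textures for all inputs (random data, or read from flags_->in_data_file_) and empty
// textures for all outputs, then binds both maps to the model with BindGLTexture2DMemory.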
121 int BenchmarkUnifiedApi::LoadAndBindGLTexture() {
122   std::map<std::string, GLuint> input_gl_texture;
123   std::map<std::string, GLuint> output_gl_texture;
124 
125   if (flags_->in_data_file_.empty()) {
126     auto status = GenerateGLTexture(&input_gl_texture);
127     if (status != RET_OK) {
128       std::cerr << "Generate input GLTexture error " << status << std::endl;
129       MS_LOG(ERROR) << "Generate input GLTexture error " << status;
130       return status;
131     }
132   } else {
133     auto status = ReadGLTextureFile(&input_gl_texture);
134     if (status != RET_OK) {
135       std::cerr << "ReadGLTextureFile error, " << status << std::endl;
136       MS_LOG(ERROR) << "ReadGLTextureFile error, " << status;
137       return status;
138     }
139   }
140 
141   for (auto &tensor : ms_outputs_for_api_) {
142     auto status = FillGLTextureToTensor(&output_gl_texture, &tensor, tensor.Name());
143     if (status != RET_OK) {
144       MS_LOG(ERROR) << "Fill GLTexture to output tensor failed: " << status;
145       return status;
146     }
147   }
148 
149   auto status = ms_model_.BindGLTexture2DMemory(input_gl_texture, &output_gl_texture);
150   if (status != kSuccess) {
151     MS_LOG(ERROR) << "BindGLTexture2DMemory failed";
152     return RET_ERROR;
153   }
154   return RET_OK;
155 }
156 
157 int BenchmarkUnifiedApi::ReadGLTextureFile(std::map<std::string, GLuint> *input_gl_texture) {
158   if (ms_inputs_for_api_.empty()) {
159     return RET_OK;
160   }
161   if (this->flags_->in_data_type_ == kImage) {
162     MS_LOG(ERROR) << "Image input is not supported";
163     return RET_ERROR;
164   } else {
165     for (size_t i = 0; i < flags_->input_data_list_.size(); i++) {
166       auto tensor = ms_inputs_for_api_.at(i);
167       size_t size;
168       char *bin_buf = ReadFile(flags_->input_data_list_[i].c_str(), &size);
169       if (bin_buf == nullptr) {
170         MS_LOG(ERROR) << "ReadFile return nullptr";
171         return RET_ERROR;
172       }
173       auto tensor_data_size = tensor.DataSize();
174       if (size != tensor_data_size) {
175         std::cerr << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size
176                   << std::endl;
177         MS_LOG(ERROR) << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size;
178         delete[] bin_buf;
179         return RET_ERROR;
180       }
181 
182       auto status = FillGLTextureToTensor(input_gl_texture, &tensor, tensor.Name(), bin_buf);
183       delete[] bin_buf;
184       if (status != RET_OK) {
185         MS_LOG(ERROR) << "Fill GLTexture to input tensor failed: " << status;
186         return status;
187       }
188     }
189   }
190 
191   return RET_OK;
192 }
193 
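// Entry point for input preparation: dispatches to the GL-texture path when
// flags_->enable_gl_texture_ is set, otherwise generates random data or reads the input binaries.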
194 int BenchmarkUnifiedApi::LoadInput() {
195   if (flags_->enable_gl_texture_ == true) {
196     if (lite::BenchmarkUnifiedApi::LoadAndBindGLTexture() != RET_OK) {
197       MS_LOG(ERROR) << "Generate input GLTexture error";
198       return RET_ERROR;
199     }
200     return RET_OK;
201   }
202 
203   if (flags_->in_data_file_.empty()) {
204     auto status = GenerateInputData();
205     if (status != RET_OK) {
206       std::cerr << "Generate input data error " << status << std::endl;
207       MS_LOG(ERROR) << "Generate input data error " << status;
208       return status;
209     }
210   } else {
211     auto status = ReadInputFile();
212     if (status != RET_OK) {
213       std::cerr << "ReadInputFile error, " << status << std::endl;
214       MS_LOG(ERROR) << "ReadInputFile error, " << status;
215       return status;
216     }
217   }
218   return RET_OK;
219 }
220 
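// Generates random input data. In parallel-predict mode raw buffers (float32/int32 only) are
// allocated from flags_->resize_dims_ and collected into all_inputs_data_; otherwise each model
// input tensor is filled in place, with string tensors receiving a fixed sample string.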
221 int BenchmarkUnifiedApi::GenerateInputData() {
222 #ifdef PARALLEL_INFERENCE
223   if (flags_->enable_parallel_predict_) {
224     std::vector<void *> inputs;
225     for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
226       auto tensor_name = ms_inputs_for_api_[i].Name();
227       size_t size;
228       if (ms_inputs_for_api_[i].DataType() == static_cast<enum DataType>(kNumberTypeFloat32)) {
229         size = sizeof(float);
230       } else if (ms_inputs_for_api_[i].DataType() == static_cast<enum DataType>(kNumberTypeInt32)) {
231         size = sizeof(int32_t);
232       } else {
233         MS_LOG(ERROR) << "data type is not supported in model pool.";
234         return RET_ERROR;
235       }
236       for (size_t j = 0; j < flags_->resize_dims_[i].size(); j++) {
237         size *= flags_->resize_dims_[i][j];
238       }
239       void *input_data = new (std::nothrow) char[size];
240       if (input_data == nullptr) {
241         MS_LOG(ERROR) << "new input_data failed";
242         for (auto &data : inputs) {
243           auto buf = static_cast<char *>(data);
244           delete[] buf;
245           data = nullptr;
246         }
247         return RET_ERROR;
248       }
249       inputs.push_back(input_data);
250       int status = GenerateRandomData(size, input_data, static_cast<int>(ms_inputs_for_api_[i].DataType()));
251       if (status != RET_OK) {
252         MS_LOG(ERROR) << "GenerateRandomData for inTensor failed:" << status;
253         for (auto &data : inputs) {
254           auto buf = static_cast<char *>(data);
255           delete[] buf;
256           data = nullptr;
257         }
258         return status;
259       }
260     }
261     all_inputs_data_.push_back(inputs);
262     return RET_OK;
263   }
264 #endif
265   for (auto &tensor : ms_inputs_for_api_) {
266     if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
267       MSTensor *input = MSTensor::StringsToTensor(tensor.Name(), {"you're the best."});
268       if (input == nullptr) {
269         std::cerr << "StringsToTensor failed" << std::endl;
270         MS_LOG(ERROR) << "StringsToTensor failed";
271         return RET_ERROR;
272       }
273       tensor = *input;
274       delete input;
275     } else {
276       auto input_data = tensor.MutableData();
277       if (input_data == nullptr) {
278         MS_LOG(ERROR) << "MallocData for inTensor failed";
279         return RET_ERROR;
280       }
281       int status = GenerateRandomData(tensor.DataSize(), input_data, static_cast<int>(tensor.DataType()));
282       if (status != RET_OK) {
283         std::cerr << "GenerateRandomData for inTensor failed: " << status << std::endl;
284         MS_LOG(ERROR) << "GenerateRandomData for inTensor failed:" << status;
285         return status;
286       }
287     }
288   }
289   return RET_OK;
290 }
291 
292 void BenchmarkUnifiedApi::UpdateConfigInfo() {
293 #define WIPE_DEEP_CONFIG_ENV '0'
294 #define WIPE_DEEP_CONFIG_VOCAB_SIZE "100"
295 #define WIPE_DEEP_CONFIG_DEVICE_CACHE_SIZE "40"
296 
297   auto env = std::getenv("BENCHMARK_UPDATE_CONFIG_ENV");
298   if (env == nullptr) {
299     return;
300   }
301   if (env[0] == WIPE_DEEP_CONFIG_ENV) {
302     ms_model_.UpdateConfig(kMSCacheSection, std::make_pair(kMSCacheVocabSizeKey, WIPE_DEEP_CONFIG_VOCAB_SIZE));
303     ms_model_.UpdateConfig(kMSCacheSection, std::make_pair(kMSCacheDeviceSizeKey, WIPE_DEEP_CONFIG_DEVICE_CACHE_SIZE));
304   }
305   return;
306 }
307 
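// Reads input binaries from the files listed in flags_->input_data_list_. In parallel-predict mode
// the files are kept as raw buffers; otherwise each file must match the tensor's DataSize() and is
// copied into the tensor (string tensors are rebuilt from the file content).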
308 int BenchmarkUnifiedApi::ReadInputFile() {
309 #ifdef PARALLEL_INFERENCE
310   if (flags_->enable_parallel_predict_) {
311     std::vector<void *> inputs;
312     for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
313       size_t size;
314       char *bin_buf = ReadFile(flags_->input_data_list_[i].c_str(), &size);
315       if (bin_buf == nullptr) {
316         MS_LOG(ERROR) << "ReadFile return nullptr";
317         for (auto &data : inputs) {
318           auto buf = static_cast<char *>(data);
319           delete[] buf;
320           data = nullptr;
321         }
322         return RET_ERROR;
323       }
324       inputs.push_back(bin_buf);
325     }
326     all_inputs_data_.push_back(inputs);
327     return RET_OK;
328   }
329 #endif
330   if (ms_inputs_for_api_.empty()) {
331     return RET_OK;
332   }
333 
334   if (this->flags_->in_data_type_ == kImage) {
335     MS_LOG(ERROR) << "Image input is not supported";
336     return RET_ERROR;
337   } else {
338     for (size_t i = 0; i < flags_->input_data_list_.size(); i++) {
339       auto &cur_tensor = ms_inputs_for_api_.at(i);
340       size_t size;
341       char *bin_buf = ReadFile(flags_->input_data_list_[i].c_str(), &size);
342       if (bin_buf == nullptr) {
343         MS_LOG(ERROR) << "ReadFile return nullptr";
344         return RET_ERROR;
345       }
346       if (static_cast<int>(cur_tensor.DataType()) == kObjectTypeString) {
347         std::string str(bin_buf, size);
348         MSTensor *input = MSTensor::StringsToTensor(cur_tensor.Name(), {str});
349         if (input == nullptr) {
350           std::cerr << "StringsToTensor failed" << std::endl;
351           MS_LOG(ERROR) << "StringsToTensor failed";
352           delete[] bin_buf;
353           return RET_ERROR;
354         }
355         cur_tensor = *input;
356       } else {
357         auto tensor_data_size = cur_tensor.DataSize();
358         if (size != tensor_data_size) {
359           std::cerr << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size
360                     << std::endl;
361           MS_LOG(ERROR) << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size;
362           delete[] bin_buf;
363           return RET_ERROR;
364         }
365         auto input_data = cur_tensor.MutableData();
366         if (input_data == nullptr) {
367           MS_LOG(ERROR) << "input_data is nullptr.";
368           delete[] bin_buf;
369           return RET_ERROR;
370         }
371         memcpy(input_data, bin_buf, tensor_data_size);
372       }
373       delete[] bin_buf;
374     }
375   }
376   return RET_OK;
377 }
378 
379 int BenchmarkUnifiedApi::GetDataTypeByTensorName(const std::string &tensor_name) {
380 #ifdef PARALLEL_INFERENCE
381   for (auto tensor : ms_outputs_for_api_) {
382     auto name = tensor.Name();
383     if (name == tensor_name) {
384       return static_cast<int>(tensor.DataType());
385     }
386   }
387   MS_LOG(ERROR) << "cannot find tensor name: " << tensor_name << " in model output.";
388   return static_cast<int>(DataType::kTypeUnknown);
389 #endif
390   return static_cast<int>(ms_model_.GetOutputByTensorName(tensor_name).DataType());
391 }
392 
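// For distributed GPU runs, switches the device id to the rank id and rewrites the model/config
// file name by inserting the rank id before the ".mindir"/".config" suffix, so every rank loads
// its own file (e.g. "net.mindir" -> "net1.mindir" on rank 1).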
393 void BenchmarkUnifiedApi::UpdateDistributionName(const std::shared_ptr<mindspore::Context> &context,
394                                                  std::string *name) {
395   if (flags_->device_ != "GPU") {
396     return;
397   }
398 
399   if (name->size() == 0) {
400     return;
401   }
402 
403   if (context->MutableDeviceInfo().size() == 0) {
404     return;
405   }
406 
407   auto device_info = context->MutableDeviceInfo().front();
408   GPUDeviceInfo *gpu_info = reinterpret_cast<GPUDeviceInfo *>(device_info.get());
409   auto rank_id = gpu_info->GetRankID();
410   if (rank_id == 0) {
411     return;
412   }
413   gpu_info->SetDeviceID(rank_id);
414 
415   /* model file & benchmark data file: include .mindir
416    config file :  include .config */
417   auto replace_pos = name->find(".mindir");
418   if (replace_pos == std::string::npos) {
419     replace_pos = name->find(".config");
420   }
421 
422   if (replace_pos == std::string::npos) {
423     return;
424   }
425 
426   *name = name->replace(replace_pos, sizeof('.'), std::to_string(rank_id) + ".");
427 
428   MS_LOG(INFO) << "Update distribution info: " << *name;
429   std::cout << "Update distribution info: " << *name << std::endl;
430   return;
431 }
432 
433 void BenchmarkUnifiedApi::InitMSContextForGPU(const std::shared_ptr<mindspore::Context> &context,
434                                               std::vector<std::shared_ptr<DeviceInfoContext>> *device_list) {
435   std::shared_ptr<GPUDeviceInfo> gpu_device_info = std::make_shared<GPUDeviceInfo>();
436   gpu_device_info->SetEnableFP16(flags_->enable_fp16_);
437   uint32_t device_id = 0;
438   auto device_id_env = std::getenv("GPU_DEVICE_ID");
439   if (device_id_env != nullptr) {
440 //    try {
441       device_id = static_cast<uint32_t>(std::stoul(device_id_env));
442 //    } catch (std::invalid_argument &e) {
443 //      MS_LOG(WARNING) << "Invalid device id env:" << device_id_env << ". Set default device id 0.";
444 //    }
445     MS_LOG(INFO) << "GPU device_id = " << device_id;
446   }
447   gpu_device_info->SetDeviceID(device_id);
448   if (flags_->device_id_ >= 0) {
449     gpu_device_info->SetDeviceID(flags_->device_id_);
450     MS_LOG(INFO) << "GPU device_id = " << flags_->device_id_;
451   }
452   if (flags_->enable_gl_texture_) {
453     gpu_device_info->SetEnableGLTexture(flags_->enable_gl_texture_);
454 
455     auto gl_context = eglGetCurrentContext();
456     gpu_device_info->SetGLContext(gl_context);
457 
458     auto gl_display = eglGetCurrentDisplay();
459     gpu_device_info->SetGLDisplay(gl_display);
460   } else {
461     gpu_device_info->SetProvider("tensorrt");
462     gpu_device_info->SetAllocator(nullptr);
463   }
464   device_list->push_back(gpu_device_info);
465 }
466 
467 void BenchmarkUnifiedApi::InitMSContextForAscend(const std::shared_ptr<mindspore::Context> &context,
468                                                  std::vector<std::shared_ptr<DeviceInfoContext>> *device_list) {
469   uint32_t device_id = 0;
470   auto device_id_env = std::getenv("ASCEND_DEVICE_ID");
471   if (device_id_env != nullptr) {
472 //    try {
473       device_id = static_cast<uint32_t>(std::stoul(device_id_env));
474 //    } catch (std::invalid_argument &e) {
475 //      MS_LOG(WARNING) << "Invalid device id env:" << device_id_env << ". Set default device id 0.";
476 //    }
477     MS_LOG(INFO) << "Ascend device_id = " << device_id;
478   }
479   std::shared_ptr<AscendDeviceInfo> ascend_device_info = std::make_shared<AscendDeviceInfo>();
480   ascend_device_info->SetDeviceID(device_id);
481   ascend_device_info->SetProvider(flags_->provider_);
482   auto back_policy_env = std::getenv("ASCEND_BACK_POLICY");
483   if (back_policy_env != nullptr) {
484     ascend_device_info->SetProvider(back_policy_env);
485   }
486 #ifdef ENABLE_CLOUD_FUSION_INFERENCE
487   if (flags_->device_id_ >= 0 && flags_->rank_id_ >= 0) {
488     ascend_device_info->SetDeviceID(flags_->device_id_);
489     ascend_device_info->SetRankID(flags_->rank_id_);
490     ascend_device_info->SetProvider("ge");
491   }
492 #endif
493   device_list->push_back(ascend_device_info);
494 }
495 
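// Builds the device list from the command-line flags in priority order: GPU, NPU and NNRT devices
// are appended first and a CPU device is always appended last as the fallback. On this build,
// requesting the Ascend device is rejected with RET_NOT_SUPPORT.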
496 int BenchmarkUnifiedApi::InitMSContext(const std::shared_ptr<mindspore::Context> &context) {
497   context->SetThreadNum(flags_->num_threads_);
498   context->SetGroupInfoFile(flags_->group_info_file_);
499   context->SetThreadAffinity(flags_->cpu_bind_mode_);
500   context->SetInterOpParallelNum(flags_->inter_op_parallel_num_);
501   if (!flags_->core_list_.empty()) {
502     context->SetThreadAffinity(flags_->core_list_);
503   }
504 #ifndef ENABLE_CLOUD_FUSION_INFERENCE
505   if (flags_->delegate_mode_ == "CoreML") {
506     context->SetBuiltInDelegate(kCoreML);
507   } else if (flags_->delegate_mode_ == "NNAPI") {
508     context->SetBuiltInDelegate(kNNAPI);
509   }
510   context->SetEnableParallel(flags_->enable_parallel_);
511 #endif
512 
513   auto &device_list = context->MutableDeviceInfo();
514   if (flags_->device_ == "GPU" || flags_->device_ == "Auto") {
515     InitMSContextForGPU(context, &device_list);
516   }
517 
518   if (flags_->device_ == "NPU" || flags_->device_ == "Auto") {
519     std::shared_ptr<KirinNPUDeviceInfo> npu_device_info = std::make_shared<KirinNPUDeviceInfo>();
520     npu_device_info->SetEnableFP16(flags_->enable_fp16_);
521     npu_device_info->SetFrequency(kFrequencyDefault);
522     device_list.push_back(npu_device_info);
523   }
524 
525   if (flags_->device_ == "Ascend" || flags_->device_ == "Auto") {
526     MS_LOG(ERROR) << "OHOS does not support Ascend devices.";
527     return RET_NOT_SUPPORT;
528   }
529 
530   if (flags_->device_ == "NNRT" || flags_->device_ == "Auto") {
531     std::shared_ptr<NNRTDeviceInfo> nnrt_device_info = std::make_shared<NNRTDeviceInfo>();
532     size_t num = 0;
533     auto descs = OH_AI_GetAllNNRTDeviceDescs(&num);
534     NNRTDeviceDesc *desc_nnrt = nullptr;
535     for (size_t i = 0; i < num; i++) {
536       auto desc = OH_AI_GetElementOfNNRTDeviceDescs(descs, i);
537       auto name = OH_AI_GetNameFromNNRTDeviceDesc(desc);
538       if (strncmp(name, "NPU_", 4) == 0) {  // NPU inference with online compilation
539         desc_nnrt = desc;
540         break;
541       }
542     }
543     if (desc_nnrt == nullptr) {
544       BENCHMARK_LOG_ERROR("nnrt desc get failed");
545       return RET_ERROR;
546     }
547     auto id = OH_AI_GetDeviceIdFromNNRTDeviceDesc(desc_nnrt);
548     nnrt_device_info->SetDeviceID(id);
549     nnrt_device_info->SetPerformanceMode(flags_->nnrt_performance_mode_);
550     OH_AI_DestroyAllNNRTDeviceDescs(&descs);
551     device_list.push_back(nnrt_device_info);
552   }
553 
554   // CPU priority is behind GPU and NPU
555   std::shared_ptr<CPUDeviceInfo> device_info = std::make_shared<CPUDeviceInfo>();
556   device_info->SetEnableFP16(flags_->enable_fp16_);
557   device_info->SetProvider(flags_->provider_);
558   device_list.push_back(device_info);
559 
560   return RET_OK;
561 }
562 #ifdef PARALLEL_INFERENCE
563 int BenchmarkUnifiedApi::CompareOutputForModelPool(std::vector<mindspore::MSTensor> *outputs) {
564   if (outputs->empty()) {
565     MS_LOG(ERROR) << "outputs is empty.";
566     return RET_ERROR;
567   }
568   std::cout << "================ Comparing Output data ================" << std::endl;
569   float total_bias = 0;
570   int total_size = 0;
571   // check the output tensor name.
572   for (size_t i = 0; i < outputs->size(); i++) {
573     std::string tensor_name = outputs->at(i).Name();
574     mindspore::MSTensor tensor = outputs->at(i);
575     if (tensor == nullptr) {
576       MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
577       return RET_ERROR;
578     }
579     constexpr float kParallelRelative = 1e-7;
580     constexpr float kParallelAbsolute = 1e-10;
581     int ret = CompareDataGetTotalBiasAndSize(tensor_name, &tensor, &total_bias, &total_size, kParallelRelative,
582                                              kParallelAbsolute);
583     if (ret != RET_OK) {
584       MS_LOG(ERROR) << "Error in CompareData";
585       std::cerr << "Error in CompareData" << std::endl;
586       std::cout << "=======================================================" << std::endl << std::endl;
587       return ret;
588     }
589   }
590   float mean_bias;
591   if (total_size != 0) {
592     mean_bias = ((total_bias / float_t(total_size)) * kPercentageDivisor);
593   } else {
594     mean_bias = 0;
595   }
596 
597   std::cout << "Mean bias of all nodes/tensors: " << mean_bias << "%" << std::endl;
598   std::cout << "=======================================================" << std::endl << std::endl;
599 
600   if (mean_bias > this->flags_->accuracy_threshold_) {
601     MS_LOG(ERROR) << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%";
602     std::cerr << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%" << std::endl;
603     return RET_ERROR;
604   }
605   return RET_OK;
606 }
607 #endif
608 
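// Converts one IEEE-754 half-precision value to single precision by shifting the sign, exponent
// and mantissa into place and re-biasing the exponent (112 << 23 == 0x38000000). Example:
// 0x3C00 (fp16 1.0) -> bits 0x3F800000 (fp32 1.0). Subnormal halves are flushed to zero and
// Inf/NaN inputs are not special-cased.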
609 void Convert2Float32(float *__restrict out, const uint16_t in) {
610   uint32_t t1;
611   uint32_t t2;
612   uint32_t t3;
613 
614   t1 = in & 0x7fffu;
615   t2 = in & 0x8000u;
616   t3 = in & 0x7c00u;
617 
618   t1 <<= 13u;
619   t2 <<= 16u;
620 
621   t1 += 0x38000000;
622 
623   t1 = (t3 == 0 ? 0 : t1);
624 
625   t1 |= t2;
626 
627   *(reinterpret_cast<uint32_t *>(out)) = t1;  // reinterpret the assembled bits, not a value cast
628 }
629 
630 namespace {
631 template <typename T>
632 bool VectorValueCompare(const std::vector<T> &vec1, const std::vector<T> &vec2) {
633   if (vec1.size() != vec2.size()) {
634     return false;
635   }
636   for (auto &ele : vec1) {
637     if (!IsContain(vec2, ele)) {
638       return false;
639     }
640   }
641   return true;
642 }
643 }  // namespace
644 
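// Compares every output tensor against the calibration data: verifies the output tensor names,
// copies GL-texture outputs back to host memory when flags_->enable_gl_texture_ is set,
// accumulates the per-tensor bias and fails when the mean bias exceeds flags_->accuracy_threshold_.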
645 int BenchmarkUnifiedApi::CompareOutput() {
646   std::cout << "================ Comparing Output data ================" << std::endl;
647   float total_bias = 0;
648   int total_size = 0;
649   // check the output tensor name.
650   if (!VectorValueCompare(this->benchmark_tensor_names_, ms_model_.GetOutputTensorNames())) {
651     MS_LOG(ERROR) << "The output tensor name is wrong.";
652     return RET_ERROR;
653   }
654   for (const auto &calib_tensor : benchmark_data_) {
655     std::string tensor_name = calib_tensor.first;
656     mindspore::MSTensor tensor = ms_model_.GetOutputByTensorName(tensor_name);
657     if (tensor == nullptr) {
658       MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
659       return RET_ERROR;
660     }
661     int ret;
662     if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
663       std::vector<std::string> output_strings = MSTensor::TensorToStrings(tensor);
664       ret = CompareStringData(tensor_name, calib_tensor.second->strings_data, output_strings);
665     } else {
666       if (flags_->enable_gl_texture_) {
667         auto *gltexture_id = reinterpret_cast<GLuint *>(tensor.MutableData());
668         if (gltexture_id == nullptr) {
669           MS_LOG(ERROR) << "get gltexture_id failed";
670           return RET_ERROR;
671         }
672         auto tmp = gl_runtime_.CopyDeviceTextureToHost(*gltexture_id);
673         if (tmp == nullptr) {
674           MS_LOG(ERROR) << "CopyDeviceTextureToHost failed";
675           return RET_ERROR;
676         }
677         float *hostptr = reinterpret_cast<float *>(tmp);
678 
679         auto tensor_shape = tensor.Shape();
680         auto data_len =
681           std::accumulate(tensor_shape.begin(), tensor_shape.end(), sizeof(float), std::multiplies<size_t>());
682         auto *new_tensor = new (std::nothrow)
683           MSTensor(tensor_name, mindspore::DataType::kNumberTypeFloat32, tensor_shape, hostptr, data_len);
684         MS_CHECK_TRUE_MSG(new_tensor != nullptr, RET_ERROR, "new tensor failed");
685         if (new_tensor->MutableData() == nullptr) {
686           MS_LOG(ERROR) << "CopyDeviceTextureToHost failed";
687           delete new_tensor;
688           return RET_ERROR;
689         }
690         ret = CompareDataGetTotalBiasAndSize(tensor_name, new_tensor, &total_bias, &total_size);
691         delete new_tensor;
692       } else {
693         ret = CompareDataGetTotalBiasAndSize(tensor_name, &tensor, &total_bias, &total_size);
694       }
695     }
696     if (ret != RET_OK) {
697       MS_LOG(ERROR) << "Error in CompareData";
698       std::cerr << "Error in CompareData" << std::endl;
699       std::cout << "=======================================================" << std::endl << std::endl;
700       return ret;
701     }
702   }
703   float mean_bias;
704   if (total_size != 0) {
705     mean_bias = ((total_bias / float_t(total_size)) * kPercentageDivisor);
706   } else {
707     mean_bias = 0;
708   }
709 
710   std::cout << "Mean bias of all nodes/tensors: " << mean_bias << "%" << std::endl;
711   std::cout << "=======================================================" << std::endl << std::endl;
712 
713   if (mean_bias > this->flags_->accuracy_threshold_) {
714     MS_LOG(ERROR) << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%";
715     std::cerr << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%" << std::endl;
716     return RET_ERROR;
717   }
718   return RET_OK;
719 }
720 
721 int BenchmarkUnifiedApi::CompareOutputByCosineDistance(float cosine_distance_threshold) {
722   std::cout << "================ Comparing Output data ================" << std::endl;
723   float total_cosine_distance = 0;
724   int total_size = 0;
725   // check the output tensor name.
726   if (this->benchmark_tensor_names_ != ms_model_.GetOutputTensorNames()) {
727     MS_LOG(ERROR) << "The output tensor name is wrong.";
728     return RET_ERROR;
729   }
730   for (const auto &calib_tensor : benchmark_data_) {
731     std::string tensor_name = calib_tensor.first;
732     mindspore::MSTensor tensor = ms_model_.GetOutputByTensorName(tensor_name);
733     if (tensor == nullptr) {
734       MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
735       return RET_ERROR;
736     }
737     int ret;
738     if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
739       std::vector<std::string> output_strings = MSTensor::TensorToStrings(tensor);
740       ret = CompareStringData(tensor_name, calib_tensor.second->strings_data, output_strings);
741     } else {
742       ret = CompareDataGetTotalCosineDistanceAndSize(tensor_name, &tensor, &total_cosine_distance, &total_size);
743     }
744     if (ret != RET_OK) {
745       MS_LOG(ERROR) << "Error in CompareData";
746       std::cerr << "Error in CompareData" << std::endl;
747       std::cout << "=======================================================" << std::endl << std::endl;
748       return ret;
749     }
750   }
751   float mean_cosine_distance;
752   if (total_size != 0) {
753     mean_cosine_distance = total_cosine_distance / float_t(total_size);
754   } else {
755     mean_cosine_distance = CosineErrMaxVal;
756   }
757   mean_cosine_distance = 1 - mean_cosine_distance;
758   std::cout << "Cosine distance of all nodes/tensors: " << std::setprecision(std::numeric_limits<double>::digits10)
759             << mean_cosine_distance << std::endl;
760   std::cout << "=======================================================" << std::endl << std::endl;
761 
762   if (mean_cosine_distance < cosine_distance_threshold) {
763     MS_LOG(ERROR) << "cosine distance of all nodes/tensors is too small: " << mean_cosine_distance;
764     std::cerr << "Mean cosine distance of all nodes/tensors is too small: " << mean_cosine_distance << std::endl;
765     return RET_ERROR;
766   }
767   return RET_OK;
768 }
769 
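// Compares one output tensor with its calibration data, dispatching on the data type (float16 is
// converted to float32 first), and adds the resulting bias to the running totals.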
770 int BenchmarkUnifiedApi::CompareDataGetTotalBiasAndSize(const std::string &name, mindspore::MSTensor *tensor,
771                                                         float *total_bias, int *total_size, float relative_tolerance,
772                                                         float absolute_tolerance) {
773   float bias = 0;
774   auto mutableData = tensor->MutableData();
775   if (mutableData == nullptr) {
776     MS_LOG(ERROR) << "mutableData is nullptr.";
777     return RET_ERROR;
778   }
779   switch (static_cast<int>(tensor->DataType())) {
780     case TypeId::kNumberTypeFloat:
781     case TypeId::kNumberTypeFloat32: {
782       bias = CompareData<float, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
783       break;
784     }
785     case TypeId::kNumberTypeInt8: {
786       bias = CompareData<int8_t, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
787       break;
788     }
789     case TypeId::kNumberTypeUInt8: {
790       bias = CompareData<uint8_t, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
791       break;
792     }
793     case TypeId::kNumberTypeInt32: {
794       bias = CompareData<int32_t, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
795       break;
796     }
797     case TypeId::kNumberTypeInt16: {
798       bias = CompareData<int16_t, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
799       break;
800     }
801     case TypeId::kNumberTypeBool: {
802       bias = CompareData<bool, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
803       break;
804     }
805     case TypeId::kNumberTypeFloat16: {
806       size_t shapeSize = 1;
807       for (int64_t dim : tensor->Shape()) {
808         if (dim <= 0) {
809           MS_LOG(ERROR) << "The shape of output " << name << " should be greater than 0 after inference, got "
810                         << tensor->Shape();
811           return RET_ERROR;
812         }
813         MS_CHECK_FALSE_MSG(SIZE_MUL_OVERFLOW(shapeSize, static_cast<size_t>(dim)), RET_ERROR, "mul overflow");
814         shapeSize *= static_cast<size_t>(dim);
815       }
816       auto *floatArr = new float[shapeSize];
817       for (size_t i = 0; i < shapeSize; ++i) {
818         uint16_t tmpInt = reinterpret_cast<uint16_t *>(mutableData)[i];
819         floatArr[i] = ShortToFloat32(tmpInt);
820       }
821       bias = CompareData<float, int64_t>(name, tensor->Shape(), floatArr);
822       delete[] floatArr;
823       break;
824     }
825     default:
826       MS_LOG(ERROR) << "Datatype " << static_cast<int>(tensor->DataType()) << " is not supported.";
827       return RET_ERROR;
828   }
829   if (bias < 0) {
830     MS_LOG(ERROR) << "CompareData failed, name: " << name;
831     return RET_ERROR;
832   }
833   *total_bias += bias;
834   *total_size += 1;
835   return RET_OK;
836 }
837 int BenchmarkUnifiedApi::CompareDataGetTotalCosineDistanceAndSize(const std::string &name, mindspore::MSTensor *tensor,
838                                                                   float *total_cosine_distance, int *total_size) {
839   if (tensor == nullptr) {
840     MS_LOG(ERROR) << "tensor is nullptr.";
841     return RET_ERROR;
842   }
843   if (total_cosine_distance == nullptr) {
844     MS_LOG(ERROR) << "total_cosine_distance is nullptr.";
845     return RET_ERROR;
846   }
847   if (total_size == nullptr) {
848     MS_LOG(ERROR) << "total_size is nullptr.";
849     return RET_ERROR;
850   }
851   float bias = 0;
852   auto mutableData = tensor->MutableData();
853   if (mutableData == nullptr) {
854     MS_LOG(ERROR) << "mutableData is nullptr.";
855     return RET_ERROR;
856   }
857   int res = RET_OK;
858   switch (static_cast<int>(tensor->DataType())) {
859     case TypeId::kNumberTypeFloat:
860     case TypeId::kNumberTypeFloat32: {
861       res = CompareDatabyCosineDistance<float>(name, tensor->Shape(), mutableData, &bias);
862       break;
863     }
864     case TypeId::kNumberTypeFloat16: {
865       size_t shapeSize = 1;
866       for (int64_t dim : tensor->Shape()) {
867         if (dim <= 0) {
868           MS_LOG(ERROR) << "Invalid shape.";
869           return RET_ERROR;
870         }
871         MS_CHECK_FALSE_MSG(SIZE_MUL_OVERFLOW(shapeSize, static_cast<size_t>(dim)), RET_ERROR, "mul overflow");
872         shapeSize *= static_cast<size_t>(dim);
873       }
874       float *floatArr = new float[shapeSize];
875       for (size_t i = 0; i < shapeSize; ++i) {
876         uint16_t tmpInt = reinterpret_cast<uint16_t *>(mutableData)[i];
877         Convert2Float32(&floatArr[i], tmpInt);
878       }
879       // Compare the converted float32 copy; writing float32 back into the fp16 buffer would overrun it.
880       res = CompareDatabyCosineDistance<float>(name, tensor->Shape(), floatArr, &bias);
881       delete[] floatArr;
882       break;
883     }
884     case TypeId::kNumberTypeInt8: {
885       res = CompareDatabyCosineDistance<int8_t>(name, tensor->Shape(), mutableData, &bias);
886       break;
887     }
888     case TypeId::kNumberTypeUInt8: {
889       res = CompareDatabyCosineDistance<uint8_t>(name, tensor->Shape(), mutableData, &bias);
890       break;
891     }
892     case TypeId::kNumberTypeInt32: {
893       res = CompareDatabyCosineDistance<int32_t>(name, tensor->Shape(), mutableData, &bias);
894       break;
895     }
896     case TypeId::kNumberTypeInt16: {
897       res = CompareDatabyCosineDistance<int16_t>(name, tensor->Shape(), mutableData, &bias);
898       break;
899     }
900     case TypeId::kNumberTypeBool: {
901       res = CompareDatabyCosineDistance<bool>(name, tensor->Shape(), mutableData, &bias);
902       break;
903     }
904     default:
905       MS_LOG(ERROR) << "Datatype " << static_cast<int>(tensor->DataType()) << " is not supported.";
906       return RET_ERROR;
907   }
908   if (res != RET_OK) {
909     MS_LOG(ERROR) << "CompareData failed, name: " << name;
910     return RET_ERROR;
911   }
912   *total_cosine_distance += 1 - bias;
913   *total_size += 1;
914   return RET_OK;
915 }
916 
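// Performance mode: runs flags_->warm_up_loop_count_ warm-up inferences, then flags_->loop_count_
// timed inferences, optionally printing per-op time or perf-counter tables, and reports
// min/max/avg latency in milliseconds.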
917 int BenchmarkUnifiedApi::MarkPerformance() {
918   MS_LOG(INFO) << "Running warm up loops...";
919   std::cout << "Running warm up loops..." << std::endl;
920   std::vector<MSTensor> outputs;
921   for (int i = 0; i < flags_->warm_up_loop_count_; i++) {
922     auto status = ms_model_.Predict(ms_inputs_for_api_, &outputs);
923     if (status != kSuccess) {
924       MS_LOG(ERROR) << "Inference error ";
925       std::cerr << "Inference error " << std::endl;
926       return RET_ERROR;
927     }
928   }
929 
930   MS_LOG(INFO) << "Running benchmark loops...";
931   std::cout << "Running benchmark loops..." << std::endl;
932   uint64_t time_min = UINT64_MAX;
933   uint64_t time_max = 0;
934   uint64_t time_avg = 0;
935 
936   for (int i = 0; i < flags_->loop_count_; i++) {
937     auto inputs = ms_model_.GetInputs();
938     for (auto tensor : inputs) {
939       tensor.MutableData();  // prepare data
940     }
941     auto start = GetTimeUs();
942     auto status = ms_model_.Predict(ms_inputs_for_api_, &outputs, ms_before_call_back_, ms_after_call_back_);
943     if (status != kSuccess) {
944       MS_LOG(ERROR) << "Inference error ";
945       std::cerr << "Inference error ";
946       return RET_ERROR;
947     }
948 
949     auto end = GetTimeUs();
950     auto time = end - start;
951     time_min = std::min(time_min, time);
952     time_max = std::max(time_max, time);
953     time_avg += time;
954   }
955 
956   if (flags_->time_profiling_) {
957     const std::vector<std::string> per_op_name = {"opName", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
958     const std::vector<std::string> per_op_type = {"opType", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
959     (void)PrintResult(per_op_name, op_times_by_name_);
960     (void)PrintResult(per_op_type, op_times_by_type_);
961 #ifdef ENABLE_ARM64
962   } else if (flags_->perf_profiling_) {
963     if (flags_->perf_event_ == "CACHE") {
964       const std::vector<std::string> per_op_name = {"opName", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
965       const std::vector<std::string> per_op_type = {"opType", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
966       (void)PrintPerfResult(per_op_name, op_perf_by_name_);
967       (void)PrintPerfResult(per_op_type, op_perf_by_type_);
968     } else if (flags_->perf_event_ == "STALL") {
969       const std::vector<std::string> per_op_name = {"opName", "frontend(k)", "frontend(%)", "backend(k)",
970                                                     "backend(%)"};
971       const std::vector<std::string> per_op_type = {"opType", "frontend(k)", "frontend(%)", "backend(k)",
972                                                     "backend(%)"};
973       (void)PrintPerfResult(per_op_name, op_perf_by_name_);
974       (void)PrintPerfResult(per_op_type, op_perf_by_type_);
975     } else {
976       const std::vector<std::string> per_op_name = {"opName", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
977       const std::vector<std::string> per_op_type = {"opType", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
978       (void)PrintPerfResult(per_op_name, op_perf_by_name_);
979       (void)PrintPerfResult(per_op_type, op_perf_by_type_);
980     }
981 #endif
982   }
983 
984   if (flags_->loop_count_ > 0) {
985     time_avg /= static_cast<size_t>(flags_->loop_count_);
986     MS_LOG(INFO) << "Model = " << flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
987                  << ", NumThreads = " << flags_->num_threads_ << ", MinRunTime = " << time_min / kFloatMSEC
988                  << ", MaxRuntime = " << time_max / kFloatMSEC << ", AvgRunTime = " << time_avg / kFloatMSEC;
989     printf("Model = %s, NumThreads = %d, MinRunTime = %f ms, MaxRuntime = %f ms, AvgRunTime = %f ms\n",
990            flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1).c_str(), flags_->num_threads_,
991            time_min / kFloatMSEC, time_max / kFloatMSEC, time_avg / kFloatMSEC);
992   }
993   return RET_OK;
994 }
995 
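// Accuracy mode: prints the input data, runs a single inference, then checks the outputs against
// the calibration file by mean bias and, when flags_->cosine_distance_threshold_ is set, by
// cosine distance as well.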
996 int BenchmarkUnifiedApi::MarkAccuracy() {
997   MS_LOG(INFO) << "MarkAccuracy";
998   std::cout << "MarkAccuracy" << std::endl;
999 
1000   int status = 0;
1001   if (flags_->enable_gl_texture_) {
1002     for (auto in_tensor : ms_inputs_for_api_) {
1003       auto *input = reinterpret_cast<GLuint *>(in_tensor.MutableData());
1004       if (input == nullptr) {
1005         MS_LOG(ERROR) << "get input data failed";
1006         return RET_ERROR;
1007       }
1008       float *hostptr = reinterpret_cast<float *>(gl_runtime_.CopyDeviceTextureToHost(*input));
1009       size_t print_num = 20;
1010       gl_runtime_.PrintImage2DData(hostptr, 1, 1, print_num);
1011     }
1012   } else {
1013     status = PrintInputData();
1014     if (status != RET_OK) {
1015       MS_LOG(ERROR) << "PrintInputData error " << status;
1016       std::cerr << "PrintInputData error " << status << std::endl;
1017       return status;
1018     }
1019   }
1020   std::vector<MSTensor> outputs;
1021   auto ret = ms_model_.Predict(ms_inputs_for_api_, &outputs, ms_before_call_back_, ms_after_call_back_);
1022   if (ret != kSuccess) {
1023     MS_LOG(ERROR) << "Inference error ";
1024     std::cerr << "Inference error " << std::endl;
1025     return RET_ERROR;
1026   }
1027   status = ReadCalibData();
1028   if (status != RET_OK) {
1029     MS_LOG(ERROR) << "Read calib data error " << status;
1030     std::cerr << "Read calib data error " << status << std::endl;
1031     return status;
1032   }
1033   status = CompareOutput();
1034   if (status != RET_OK) {
1035     MS_LOG(ERROR) << "Compare output error " << status;
1036     std::cerr << "Compare output error " << status << std::endl;
1037     return status;
1038   }
1039   if (this->flags_->cosine_distance_threshold_ >= -1) {
1040     status = CompareOutputByCosineDistance(this->flags_->cosine_distance_threshold_);
1041     if (status != RET_OK) {
1042       MS_LOG(ERROR) << "Compare output error by cosine distance " << status;
1043       std::cerr << "Compare output error by cosine distance " << status << std::endl;
1044       return status;
1045     }
1046   }
1047   return RET_OK;
1048 }
1049 
1050 int BenchmarkUnifiedApi::PrintInputData() {
1051   for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
1052     mindspore::MSTensor input = ms_inputs_for_api_[i];
1053     auto tensor_data_type = static_cast<int>(input.DataType());
1054 
1055     std::cout << "InData " << i << ": ";
1056     if (tensor_data_type == TypeId::kNumberTypeFloat16) {
1057       MS_LOG(INFO) << "DataType: " << TypeId::kNumberTypeFloat16;
1058       continue;
1059     }
1060     if (tensor_data_type == TypeId::kObjectTypeString) {
1061       std::vector<std::string> output_strings = MSTensor::TensorToStrings(input);
1062       size_t print_num = std::min(output_strings.size(), static_cast<size_t>(20));
1063       for (size_t j = 0; j < print_num; j++) {
1064         std::cout << output_strings[j] << std::endl;
1065       }
1066       continue;
1067     }
1068     size_t print_num = std::min(static_cast<int>(input.ElementNum()), kPrintDataNum);
1069     const void *in_data = input.MutableData();
1070     if (in_data == nullptr) {
1071       MS_LOG(ERROR) << "in_data is nullptr.";
1072       return RET_ERROR;
1073     }
1074 
1075     for (size_t j = 0; j < print_num; j++) {
1076       if (tensor_data_type == TypeId::kNumberTypeFloat32 || tensor_data_type == TypeId::kNumberTypeFloat) {
1077         std::cout << static_cast<const float *>(in_data)[j] << " ";
1078       } else if (tensor_data_type == TypeId::kNumberTypeInt8) {
1079         std::cout << static_cast<const int8_t *>(in_data)[j] << " ";
1080       } else if (tensor_data_type == TypeId::kNumberTypeUInt8) {
1081         std::cout << static_cast<const uint8_t *>(in_data)[j] << " ";
1082       } else if (tensor_data_type == TypeId::kNumberTypeInt32) {
1083         std::cout << static_cast<const int32_t *>(in_data)[j] << " ";
1084       } else if (tensor_data_type == TypeId::kNumberTypeInt64) {
1085         std::cout << static_cast<const int64_t *>(in_data)[j] << " ";
1086       } else if (tensor_data_type == TypeId::kNumberTypeBool) {
1087         std::cout << static_cast<const bool *>(in_data)[j] << " ";
1088       } else {
1089         MS_LOG(ERROR) << "Datatype: " << tensor_data_type << " is not supported.";
1090         return RET_ERROR;
1091       }
1092     }
1093     std::cout << std::endl;
1094   }
1095   return RET_OK;
1096 }
1097 #ifdef PARALLEL_INFERENCE
1098 void BenchmarkUnifiedApi::ModelParallelRunnerWarmUp(int index) {
1099   auto in = model_runner_.GetInputs();
1100   for (size_t i = 0; i < in.size(); i++) {
1101     in[i].SetShape(resize_dims_[i]);
1102     in[i].SetData(all_inputs_data_[index][i], false);
1103   }
1104   auto warm_up_start = GetTimeUs();
1105   std::vector<MSTensor> output;
1106   auto ret = model_runner_.Predict(in, &output);
1107   for (size_t j = 0; j < in.size(); j++) {
1108     in[j].SetData(nullptr);
1109   }
1110   if (ret != kSuccess) {
1111     model_parallel_runner_ret_failed_ = true;
1112     MS_LOG(ERROR) << "model pool predict failed.";
1113     return;
1114   }
1115   auto warm_up_end = GetTimeUs();
1116   std::cout << "warm up index: " << index << " | time: " << (warm_up_end - warm_up_start) / kFloatMSEC << " ms\n";
1117 }
1118 
1119 void BenchmarkUnifiedApi::ModelParallelRunnerRun(int task_num, int parallel_idx) {
1120   for (int i = 0; i < task_num || task_num == -1; i++) {
1121     while (!runner_run_start_) {
1122       continue;
1123     }
1124     int idx = parallel_idx + flags_->warm_up_loop_count_;
1125     auto in = model_runner_.GetInputs();
1126     if (idx >= static_cast<int>(all_inputs_data_.size())) {
1127       MS_LOG(ERROR) << "idx is too big: " << idx;
1128       return;
1129     }
1130     auto in_data = all_inputs_data_[idx];
1131     for (size_t tensor_index = 0; tensor_index < in.size(); tensor_index++) {
1132       in.at(tensor_index).SetShape(resize_dims_.at(tensor_index));
1133       in.at(tensor_index).SetData(all_inputs_data_.at(idx)[tensor_index], false);
1134     }
1135     auto predict_start = GetTimeUs();
1136     std::vector<MSTensor> output;
1137     auto ret = model_runner_.Predict(in, &output);
1138     if (ret != kSuccess) {
1139       model_parallel_runner_ret_failed_ = true;
1140       MS_LOG(ERROR) << "model pool predict failed.";
1141       for (auto &item : in) {
1142         item.SetData(nullptr);
1143       }
1144       return;
1145     }
1146     auto predict_end = GetTimeUs();
1147     std::cout << "parallel index: " << parallel_idx << " | task index: " << i
1148               << " | predict time: " << (predict_end - predict_start) / kFloatMSEC << " ms\n";
1149     for (size_t j = 0; j < in.size(); j++) {
1150       in[j].SetData(nullptr);
1151     }
1152     if (!flags_->benchmark_data_file_.empty()) {
1153       auto status = CompareOutputForModelPool(&output);
1154       if (status != RET_OK) {
1155         model_parallel_runner_ret_failed_ = true;
1156         MS_LOG(ERROR) << "Compare output error " << status;
1157         return;
1158       }
1159     }
1160   }
1161 }
1162 
1163 int BenchmarkUnifiedApi::AddConfigInfo(const std::shared_ptr<RunnerConfig> &runner_config) {
1164   if (!flags_->config_file_.empty()) {
1165     runner_config->SetConfigPath(flags_->config_file_);
1166   }
1167   std::map<std::string, std::string> config;
1168   if (flags_->enable_shared_thread_pool_) {
1169     config[kEnableSharedThreadPoolKey] = "true";
1170     if (!flags_->thread_num_limit_per_worker_.empty()) {
1171       config[kThreadNumLimitPerWorkerKey] = flags_->thread_num_limit_per_worker_;
1172     }
1173     if (!flags_->thread_num_remaining_per_worker_.empty()) {
1174       config[kThreadNumRemainingPerWorkerKey] = flags_->thread_num_remaining_per_worker_;
1175     }
1176   } else {
1177     config[kEnableSharedThreadPoolKey] = "false";
1178   }
1179   runner_config->SetConfigInfo(kSharedThreadPoolSection, config);
1180   return RET_OK;
1181 }
1182 
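// Parallel-predict benchmark: initializes the ModelParallelRunner, prepares one set of input
// buffers per warm-up/parallel request, warms the pool up with one thread per warm-up loop, then
// launches flags_->parallel_num_ worker threads (each running flags_->parallel_task_num_ requests)
// and reports the init time and total run time.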
1183 int BenchmarkUnifiedApi::ParallelInference(std::shared_ptr<mindspore::Context> context) {
1184   if (flags_->warm_up_loop_count_ > kMaxRequestNum || flags_->parallel_num_ > kMaxRequestNum) {
1185     MS_LOG(WARNING) << "in parallel predict, warm up loop count should be less than " << kMaxRequestNum;
1186   }
1187 
1188   // model runner init
1189   auto runner_config = std::make_shared<RunnerConfig>();
1190   runner_config->SetContext(context);
1191   runner_config->SetWorkersNum(flags_->workers_num_);
1192   auto status = AddConfigInfo(runner_config);
1193   MS_CHECK_FALSE_MSG(status != kSuccess, RET_ERROR, "add config info for parallel predict failed.");
1194   auto model_init_start = GetTimeUs();
1195   auto ret = model_runner_.Init(flags_->model_file_, runner_config);
1196   MS_CHECK_FALSE_MSG(ret != kSuccess, RET_ERROR, "model pool init failed.");
1197   auto model_init_end = GetTimeUs();
1198 
1199   // load data
1200   ms_inputs_for_api_ = model_runner_.GetInputs();
1201   MS_CHECK_FALSE_MSG(ms_inputs_for_api_.empty(), RET_ERROR, "model pool input is empty.");
1202   ms_outputs_for_api_ = model_runner_.GetOutputs();
1203   MS_CHECK_FALSE_MSG(ms_outputs_for_api_.empty(), RET_ERROR, "model pool output is empty.");
1204 
1205   if (!flags_->graph_input_shape_map_.empty()) {
1206     // parse model input shapes from --inputShape flag
1207     std::vector<std::vector<int64_t>> resize_dims = ParseGraphInputShapeMap(model_runner_.GetInputs());
1208     MS_CHECK_FALSE_MSG(resize_dims.empty(), RET_ERROR, "resize dims empty.");
1209     (void)std::transform(resize_dims.begin(), resize_dims.end(), std::back_inserter(resize_dims_),
1210                          [&](const auto &shapes) { return shapes; });
1211   } else {
1212     (void)std::transform(flags_->resize_dims_.begin(), flags_->resize_dims_.end(), std::back_inserter(resize_dims_),
1213                          [&](auto &shapes) { return this->ConverterToInt64Vector<int>(shapes); });
1214   }
1215 
1216   for (int i = 0; i < flags_->parallel_num_ + flags_->warm_up_loop_count_; i++) {
1217     status = LoadInput();
1218     MS_CHECK_FALSE_MSG(status != RET_OK, status, "Generate input data error");
1219     std::vector<MSTensor> output;
1220     all_outputs_.push_back(output);
1221   }
1222   if (!flags_->benchmark_data_file_.empty()) {
1223     for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
1224       auto &tensor = ms_inputs_for_api_[i];
1225       tensor.SetShape(resize_dims_[i]);
1226       tensor.SetData(all_inputs_data_[0][i], false);
1227     }
1228     status = PrintInputData();
1229     MS_CHECK_FALSE_MSG(status != RET_OK, status, "PrintInputData error ");
1230     status = ReadCalibData();
1231     MS_CHECK_FALSE_MSG(status != RET_OK, status, "ReadCalibData error ");
1232   }
1233 
1234   // warm up
1235   std::vector<std::thread> model_thread_warm_up;
1236   for (int i = 0; i < flags_->warm_up_loop_count_; i++) {
1237     model_thread_warm_up.push_back(std::thread(&BenchmarkUnifiedApi::ModelParallelRunnerWarmUp, this, i));
1238   }
1239   for (auto &warm_up_thread : model_thread_warm_up) {
1240     warm_up_thread.join();
1241   }
1242   if (model_parallel_runner_ret_failed_) {
1243     return RET_ERROR;
1244   }
1245   std::cout << "=============== end warm up ===============\n";
1246   // do loop count
1247   std::vector<std::thread> model_thread_run;
1248   for (int parallel_num_idx = 0; parallel_num_idx < flags_->parallel_num_; parallel_num_idx++) {
1249     model_thread_run.push_back(
1250       std::thread(&BenchmarkUnifiedApi::ModelParallelRunnerRun, this, flags_->parallel_task_num_, parallel_num_idx));
1251   }
1252   auto start_run_time = lite::GetTimeUs();
1253   runner_run_start_ = true;
1254   for (auto &run_thread : model_thread_run) {
1255     run_thread.join();
1256   }
1257   auto end_run_time = lite::GetTimeUs();
1258   if (model_parallel_runner_ret_failed_) {
1259     return RET_ERROR;
1260   }
1261   std::cout << "=================================" << std::endl;
1262   std::cout << "parallel predict init time: " << (model_init_end - model_init_start) / kFloatMSEC << " ms\n";
1263   std::cout << "parallel predict all run time: " << (end_run_time - start_run_time) / kFloatMSEC << " ms\n";
1264   std::cout << "=================================" << std::endl;
1265   return RET_OK;
1266 }
1267 #endif
1268 
1269 int BenchmarkUnifiedApi::PrintOutputData() {
1270   for (size_t i = 0; i < ms_outputs_for_api_.size(); i++) {
1271     mindspore::MSTensor input = ms_outputs_for_api_[i];
1272     auto tensor_data_type = static_cast<int>(input.DataType());
1273 
1274     std::cout << "OutData " << i << ": ";
1275     if (tensor_data_type == TypeId::kNumberTypeFloat16) {
1276       MS_LOG(INFO) << "DataType: " << TypeId::kNumberTypeFloat16;
1277       continue;
1278     }
1279     if (tensor_data_type == TypeId::kObjectTypeString) {
1280       std::vector<std::string> output_strings = MSTensor::TensorToStrings(input);
1281       size_t print_num = std::min(output_strings.size(), static_cast<size_t>(20));
1282       for (size_t j = 0; j < print_num; j++) {
1283         std::cout << output_strings[j] << std::endl;
1284       }
1285       continue;
1286     }
1287     size_t print_num = std::min(static_cast<int>(input.ElementNum()), kPrintDataNum);
1288     const void *in_data = input.MutableData();
1289     if (in_data == nullptr) {
1290       MS_LOG(ERROR) << "out_data is nullptr.";
1291       return RET_ERROR;
1292     }
1293 
1294     for (size_t j = 0; j < print_num; j++) {
1295       if (tensor_data_type == TypeId::kNumberTypeFloat32 || tensor_data_type == TypeId::kNumberTypeFloat) {
1296         std::cout << static_cast<const float *>(in_data)[j] << " ";
1297       } else if (tensor_data_type == TypeId::kNumberTypeInt8) {
1298         std::cout << static_cast<const int8_t *>(in_data)[j] << " ";
1299       } else if (tensor_data_type == TypeId::kNumberTypeUInt8) {
1300         std::cout << static_cast<const uint8_t *>(in_data)[j] << " ";
1301       } else if (tensor_data_type == TypeId::kNumberTypeInt32) {
1302         std::cout << static_cast<const int32_t *>(in_data)[j] << " ";
1303       } else if (tensor_data_type == TypeId::kNumberTypeInt64) {
1304         std::cout << static_cast<const int64_t *>(in_data)[j] << " ";
1305       } else if (tensor_data_type == TypeId::kNumberTypeBool) {
1306         std::cout << static_cast<const bool *>(in_data)[j] << " ";
1307       } else {
1308         MS_LOG(ERROR) << "Datatype: " << tensor_data_type << " is not supported.";
1309         return RET_ERROR;
1310       }
1311     }
1312     std::cout << std::endl;
1313   }
1314   return RET_OK;
1315 }
1316 
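// Build the model from flags_->model_file_. When a decrypt key is supplied it is converted from
// its hex string form, used for the encrypted build path, and wiped from memory afterwards.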
1317 int BenchmarkUnifiedApi::CompileGraph(mindspore::ModelType model_type, const std::shared_ptr<Context> &context,
1318                                       const std::string &model_name) {
1319   Key dec_key;
1320   if (!flags_->decrypt_key_str_.empty()) {
1321     dec_key.len = lite::Hex2ByteArray(flags_->decrypt_key_str_, dec_key.key, kEncMaxLen);
1322     if (dec_key.len == 0) {
1323       MS_LOG(ERROR) << "dec_key.len == 0";
1324       return RET_INPUT_PARAM_INVALID;
1325     }
1326     flags_->decrypt_key_str_.clear();
1327   }
1328   Status ret;
1329   if (flags_->crypto_lib_path_.empty()) {
1330     ret = ms_model_.Build(flags_->model_file_, model_type, context);
1331   } else {
1332     ret =
1333       ms_model_.Build(flags_->model_file_, model_type, context, dec_key, flags_->dec_mode_, flags_->crypto_lib_path_);
1334   }
1335   memset(dec_key.key, 0, kEncMaxLen);
1336   if (ret != kSuccess) {
1337     MS_LOG(ERROR) << "ms_model_.Build failed while running " << model_name.c_str();
1338     std::cout << "ms_model_.Build failed while running " << model_name.c_str() << std::endl;
1339     return RET_ERROR;
1340   }
1341   return RET_OK;
1342 }
1343 
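// Map the user-supplied --inputShape entries onto the model inputs by name and return the resize
// dimensions in model-input order; an empty result signals a name/count mismatch.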
1344 std::vector<std::vector<int64_t>> BenchmarkUnifiedApi::ParseGraphInputShapeMap(const std::vector<MSTensor> &inputs) {
1345   std::vector<std::vector<int64_t>> resize_dims;
1346   if (flags_->graph_input_shape_map_.size() != inputs.size()) {
1347     MS_LOG(ERROR) << "The number of inputs in the model does not match the parsed inputShape option. The model has ["
1348                   << inputs.size() << "] input(s), while the parsed inputShape has ["
1349                   << flags_->graph_input_shape_map_.size() << "] input(s).";
1350     return resize_dims;
1351   }
1352   for (auto &model_input : inputs) {
1353     if (flags_->graph_input_shape_map_.find(model_input.Name()) == flags_->graph_input_shape_map_.end()) {
1354       MS_LOG(ERROR) << "model input [" << model_input.Name()
1355                     << "] is not found in inputShape option, please double check";
1356       MS_LOG(ERROR) << "model input names are as follows:";
1357       for (auto &mod_input : inputs) {
1358         MS_LOG(ERROR) << mod_input.Name();
1359       }
1360       MS_LOG(ERROR) << "user input names are as follows:";
1361       for (auto &user_input : flags_->graph_input_shape_map_) {
1362         MS_LOG(ERROR) << user_input.first;
1363       }
1364       return resize_dims;
1365     } else {
1366       auto shapes = flags_->graph_input_shape_map_[model_input.Name()];
1367       resize_dims.push_back(this->ConverterToInt64Vector(shapes));
1368     }
1369   }
1370   return resize_dims;
1371 }
1372 
1373 #ifdef PARALLEL_INFERENCE
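// Parallel predict requires explicit input shapes; the runner setup, warm-up and timed runs are
// delegated to ParallelInference().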
1374 int BenchmarkUnifiedApi::RunParallelBenchmark(std::shared_ptr<mindspore::Context> context) {
1375   if (flags_->resize_dims_.empty() && flags_->graph_input_shape_map_.empty()) {
1376     MS_LOG(ERROR) << "model input shapes should be provided when using parallel predict, please specify --inputShape";
1377     return RET_ERROR;
1378   }
1379   auto status = ParallelInference(context);
1380   MS_CHECK_FALSE_MSG(status != RET_OK, RET_ERROR, "run model pool failed.");
1381   return RET_OK;
1382 }
1383 #endif
1384 
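// Top-level benchmark flow: init the optional OpenGL runtime, build the context and model (or
// dispatch to the parallel-predict path), resize inputs if shapes were given, load input data,
// then run accuracy or performance marking via GetBenchmarkResult().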
1385 int BenchmarkUnifiedApi::RunBenchmark() {
1386   auto start_prepare_time = GetTimeUs();
1387 
1388   if (flags_->enable_gl_texture_) {
1389     if (!gl_runtime_.Init()) {
1390       MS_LOG(ERROR) << "opengl runtime init failed ";
1391       std::cerr << "opengl runtime init failed ";
1392       return RET_ERROR;
1393     }
1394   }
1395 
1396   // Load graph
1397   std::string model_name = flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1);
1398   auto iter = ModelTypeMap.find(flags_->model_type_);
1399   if (iter == ModelTypeMap.end()) {
1400     MS_LOG(ERROR) << "model_type " << flags_->model_type_ << " is invalid.";
1401     std::cerr << "model_type " << flags_->model_type_ << " is invalid.";
1402     return RET_ERROR;
1403   }
1404   mindspore::ModelType model_type = iter->second;
1405 
1406   MS_LOG(INFO) << "start unified benchmark run";
1407   std::cout << "start unified benchmark run" << std::endl;
1408 
1409   auto context = std::make_shared<mindspore::Context>();
1410   if (context == nullptr) {
1411     MS_LOG(ERROR) << "New context failed while running " << model_name.c_str();
1412     std::cerr << "New context failed while running " << model_name.c_str() << std::endl;
1413     return RET_ERROR;
1414   }
1415 
1416   auto status = InitMSContext(context);
1417   if (status != RET_OK) {
1418     MS_LOG(ERROR) << "InitMSContext failed while running " << model_name.c_str();
1419     std::cout << "InitMSContext failed while running " << model_name.c_str();
1420     return RET_ERROR;
1421   }
1422 
1423   (void)UpdateDistributionName(context, &flags_->model_file_);
1424   (void)UpdateDistributionName(context, &flags_->benchmark_data_file_);
1425   (void)UpdateDistributionName(context, &flags_->config_file_);
1426 
1427   if (!flags_->config_file_.empty()) {
1428     auto config_ret = ms_model_.LoadConfig(flags_->config_file_);
1429     if (config_ret != kSuccess) {
1430       MS_LOG(ERROR) << "ms_model_.LoadConfig failed while running " << model_name.c_str();
1431       std::cout << "ms_model_.LoadConfig failed while running " << model_name.c_str() << std::endl;
1432     }
1433   }
1434 
1435   UpdateConfigInfo();
1436 #ifdef PARALLEL_INFERENCE
1437   if (flags_->enable_parallel_predict_) {
1438     MS_CHECK_FALSE_MSG(RunParallelBenchmark(context) != RET_OK, RET_ERROR, "run model pool failed.");
1439     return RET_OK;
1440   }
1441 #endif
1442 
1443   status = CompileGraph(model_type, context, model_name);
1444   MS_CHECK_FALSE_MSG(status != RET_OK, status, "Compile graph failed.");
1445   if (!flags_->graph_input_shape_map_.empty()) {
1446     std::vector<std::vector<int64_t>> resize_dims = ParseGraphInputShapeMap(ms_model_.GetInputs());
1447     MS_CHECK_FALSE_MSG(resize_dims.empty(), RET_ERROR, "resize_dims is empty");
1448     auto ret = ms_model_.Resize(ms_model_.GetInputs(), resize_dims);
1449     if (ret != kSuccess) {
1450       MS_LOG(ERROR) << "Input tensor resize failed.";
1451       std::cout << "Input tensor resize failed.";
1452       return RET_ERROR;
1453     }
1454   } else if (!flags_->resize_dims_.empty()) {
1455     std::vector<std::vector<int64_t>> resize_dims;
1456     (void)std::transform(flags_->resize_dims_.begin(), flags_->resize_dims_.end(), std::back_inserter(resize_dims),
1457                          [&](auto &shapes) { return this->ConverterToInt64Vector<int>(shapes); });
1458 
1459     auto ret = ms_model_.Resize(ms_model_.GetInputs(), resize_dims);
1460     if (ret != kSuccess) {
1461       MS_LOG(ERROR) << "Input tensor resize failed.";
1462       std::cout << "Input tensor resize failed.";
1463       return RET_ERROR;
1464     }
1465   }
1466 
1467   ms_inputs_for_api_ = ms_model_.GetInputs();
1468   ms_outputs_for_api_ = ms_model_.GetOutputs();
1469   auto end_prepare_time = GetTimeUs();
1470   MS_LOG(INFO) << "PrepareTime = " << ((end_prepare_time - start_prepare_time) / kFloatMSEC) << " ms";
1471   std::cout << "PrepareTime = " << ((end_prepare_time - start_prepare_time) / kFloatMSEC) << " ms" << std::endl;
1472 
1473   // Load input
1474   MS_LOG(INFO) << "start generate input data";
1475   status = LoadInput();
1476   if (status != RET_OK) {
1477     MS_LOG(ERROR) << "Generate input data error";
1478     return status;
1479   }
1480   return GetBenchmarkResult();
1481 }
1482 
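// Run MarkAccuracy when a benchmark data file is provided, otherwise MarkPerformance, then
// finalize the model.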
1483 int BenchmarkUnifiedApi::GetBenchmarkResult() {
1484   if (!flags_->benchmark_data_file_.empty()) {
1485     auto status = MarkAccuracy();
1486     if (status != RET_OK) {
1487       MS_LOG(ERROR) << "Run MarkAccuracy error: " << status;
1488       std::cout << "Run MarkAccuracy error: " << status << std::endl;
1489       return status;
1490     }
1491   } else {
1492     auto status = MarkPerformance();
1493     if (status != RET_OK) {
1494       MS_LOG(ERROR) << "Run MarkPerformance error: " << status;
1495       std::cout << "Run MarkPerformance error: " << status << std::endl;
1496       return status;
1497     }
1498   }
1499   if (flags_->dump_tensor_data_) {
1500     std::cout << "Dumped files are saved to: " << dump_file_output_dir_ << std::endl;
1501   }
1502   Status finalize_ret = ms_model_.Finalize();
1503   if (finalize_ret == kSuccess) {
1504     MS_LOG(INFO) << "Benchmark finalize executed success.";
1505   }
1506   return RET_OK;
1507 }
1508 
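// Install before/after callbacks that accumulate per-op latency, grouped by node type and node
// name. When inter_op_parallel_num_ > 1 the bookkeeping is guarded by op_times_mutex_ and start
// times are tracked per node name, since callbacks can presumably arrive from different threads.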
1509 int BenchmarkUnifiedApi::InitTimeProfilingCallbackParameter() {
1510   if (flags_->inter_op_parallel_num_ > 1) {
1511     // before callback
1512     ms_before_call_back_ = [&, this](const std::vector<mindspore::MSTensor> &before_inputs,
1513                                      const std::vector<mindspore::MSTensor> &before_outputs,
1514                                      const MSCallBackParam &call_param) {
1515       if (before_inputs.empty()) {
1516         MS_LOG(INFO) << "before_inputs is empty";
1517       }
1518       if (before_outputs.empty()) {
1519         MS_LOG(INFO) << "before_outputs is empty";
1520       }
1521       {
1522         std::lock_guard<std::mutex> _l(op_times_mutex_);
1523         if (op_times_by_type_.find(call_param.node_type) == op_times_by_type_.end()) {
1524           op_times_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, 0.0f)));
1525         }
1526         if (op_times_by_name_.find(call_param.node_name) == op_times_by_name_.end()) {
1527           op_times_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, 0.0f)));
1528         }
1529         op_start_times_by_name_[call_param.node_name] = GetTimeUs();
1530         op_call_times_total_++;
1531       }
1532       return true;
1533     };
1534 
1535     // after callback
1536     ms_after_call_back_ = [&, this](const std::vector<mindspore::MSTensor> &after_inputs,
1537                                     const std::vector<mindspore::MSTensor> &after_outputs,
1538                                     const MSCallBackParam &call_param) {
1539       uint64_t opEnd = GetTimeUs();
1540 
1541       if (after_inputs.empty()) {
1542         MS_LOG(INFO) << "after_inputs is empty";
1543       }
1544       if (after_outputs.empty()) {
1545         MS_LOG(INFO) << "after_outputs is empty";
1546       }
1547       {
1548         std::lock_guard<std::mutex> _l(op_times_mutex_);
1549         float cost = static_cast<float>(opEnd - op_start_times_by_name_[call_param.node_name]) / kFloatMSEC;
1550         if (flags_->device_ == "GPU") {
1551           cost = static_cast<float>(call_param.execute_time);
1552         }
1553         op_cost_total_ += cost;
1554         op_times_by_type_[call_param.node_type].first++;
1555         op_times_by_type_[call_param.node_type].second += cost;
1556         op_times_by_name_[call_param.node_name].first++;
1557         op_times_by_name_[call_param.node_name].second += cost;
1558       }
1559       return true;
1560     };
1561   } else {
1562     // before callback
1563     ms_before_call_back_ = [&, this](const std::vector<mindspore::MSTensor> &before_inputs,
1564                                      const std::vector<mindspore::MSTensor> &before_outputs,
1565                                      const MSCallBackParam &call_param) {
1566       if (before_inputs.empty()) {
1567         MS_LOG(INFO) << "before_inputs is empty";
1568       }
1569       if (before_outputs.empty()) {
1570         MS_LOG(INFO) << "before_outputs is empty";
1571       }
1572       if (op_times_by_type_.find(call_param.node_type) == op_times_by_type_.end()) {
1573         op_times_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, 0.0f)));
1574       }
1575       if (op_times_by_name_.find(call_param.node_name) == op_times_by_name_.end()) {
1576         op_times_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, 0.0f)));
1577       }
1578 
1579       op_call_times_total_++;
1580       op_begin_ = GetTimeUs();
1581       return true;
1582     };
1583 
1584     // after callback
1585     ms_after_call_back_ = [&, this](const std::vector<mindspore::MSTensor> &after_inputs,
1586                                     const std::vector<mindspore::MSTensor> &after_outputs,
1587                                     const MSCallBackParam &call_param) {
1588       uint64_t opEnd = GetTimeUs();
1589 
1590       if (after_inputs.empty()) {
1591         MS_LOG(INFO) << "after_inputs is empty";
1592       }
1593       if (after_outputs.empty()) {
1594         MS_LOG(INFO) << "after_outputs is empty";
1595       }
1596 
1597       float cost = static_cast<float>(opEnd - op_begin_) / kFloatMSEC;
1598       if (flags_->device_ == "GPU") {
1599         cost = static_cast<float>(call_param.execute_time);
1600       }
1601       op_cost_total_ += cost;
1602       op_times_by_type_[call_param.node_type].first++;
1603       op_times_by_type_[call_param.node_type].second += cost;
1604       op_times_by_name_[call_param.node_name].first++;
1605       op_times_by_name_[call_param.node_name].second += cost;
1606       return true;
1607     };
1608   }
1609   return RET_OK;
1610 }
1611 
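// Install callbacks that count a pair of hardware events per op via Linux perf_event_open
// (ARM64 only). flags_->perf_event_ selects the pair: "CACHE" counts cache references/misses,
// "STALL" counts frontend/backend stalled cycles, and the default counts CPU cycles/instructions.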
1612 int BenchmarkUnifiedApi::InitPerfProfilingCallbackParameter() {
1613 #ifndef ENABLE_ARM64
1614   MS_LOG(ERROR) << "Only support perf_profiling on arm64.";
1615   return RET_ERROR;
1616 #else
1617   struct perf_event_attr pe, pe2;
1618   memset(&pe, 0, sizeof(struct perf_event_attr));
1619   memset(&pe2, 0, sizeof(struct perf_event_attr));
1620   pe.type = PERF_TYPE_HARDWARE;
1621   pe2.type = PERF_TYPE_HARDWARE;
1622   pe.size = sizeof(struct perf_event_attr);
1623   pe2.size = sizeof(struct perf_event_attr);
1624   pe.disabled = 1;
1625   pe2.disabled = 1;
1626   pe.exclude_kernel = 1;   // don't count kernel
1627   pe2.exclude_kernel = 1;  // don't count kernel
1628   pe.exclude_hv = 1;       // don't count hypervisor
1629   pe2.exclude_hv = 1;      // don't count hypervisor
1630   pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
1631   pe2.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
1632   if (flags_->perf_event_ == "CACHE") {
1633     pe.config = PERF_COUNT_HW_CACHE_REFERENCES;
1634     pe2.config = PERF_COUNT_HW_CACHE_MISSES;
1635   } else if (flags_->perf_event_ == "STALL") {
1636     pe.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;
1637     pe2.config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND;
1638   } else {
1639     pe.config = PERF_COUNT_HW_CPU_CYCLES;
1640     pe2.config = PERF_COUNT_HW_INSTRUCTIONS;
1641   }
1642   perf_fd = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
1643   if (perf_fd == -1) {
1644     MS_LOG(ERROR) << "Failed to open perf event " << pe.config;
1645     return RET_ERROR;
1646   }
1647   perf_fd2 = syscall(__NR_perf_event_open, &pe2, 0, -1, perf_fd, 0);
1648   if (perf_fd2 == -1) {
1649     MS_LOG(ERROR) << "Failed to open perf event " << pe2.config;
1650     return RET_ERROR;
1651   }
1652   struct PerfCount zero;
1653   zero.value[0] = 0;
1654   zero.value[1] = 0;
1655   // before callback
1656   ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
1657                              const std::vector<mindspore::MSTensor> &before_outputs,
1658                              const MSCallBackParam &call_param) {
1659     if (before_inputs.empty()) {
1660       MS_LOG(INFO) << "before_inputs is empty";
1661     }
1662     if (before_outputs.empty()) {
1663       MS_LOG(INFO) << "before_outputs is empty";
1664     }
1665     if (op_perf_by_type_.find(call_param.node_type) == op_perf_by_type_.end()) {
1666       op_perf_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, zero)));
1667     }
1668     if (op_perf_by_name_.find(call_param.node_name) == op_perf_by_name_.end()) {
1669       op_perf_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, zero)));
1670     }
1671 
1672     op_call_times_total_++;
1673     ioctl(perf_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
1674     ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
1675     return true;
1676   };
1677 
1678   // after callback
1679   ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
1680                             const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
1681     struct PerfResult res;
1682     ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
1683     if (read(perf_fd, &res, sizeof(struct PerfResult)) == -1) {
1684       MS_LOG(ERROR) << "Failed to read perf_fd";
1685       return false;
1686     }
1687 
1688     if (after_inputs.empty()) {
1689       MS_LOG(INFO) << "after_inputs is empty";
1690     }
1691     if (after_outputs.empty()) {
1692       MS_LOG(INFO) << "after_outputs is empty";
1693     }
1694     float cost1 = static_cast<float>(res.values[0].value);
1695     float cost2 = static_cast<float>(res.values[1].value);
1696     op_cost_total_ += cost1;
1697     op_cost2_total_ += cost2;
1698     op_perf_by_type_[call_param.node_type].first++;
1699     op_perf_by_type_[call_param.node_type].second.value[0] += cost1;
1700     op_perf_by_type_[call_param.node_type].second.value[1] += cost2;
1701     op_perf_by_name_[call_param.node_name].first++;
1702     op_perf_by_name_[call_param.node_name].second.value[0] += cost1;
1703     op_perf_by_name_[call_param.node_name].second.value[1] += cost2;
1704     return true;
1705   };
1706 #endif
1707   return RET_OK;
1708 }
1709 
1710 namespace {
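// Render at most kDataToStringMaxNum elements of a raw buffer as a space-separated string.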
1711 template <typename T>
1712 std::string DataToString(void *data, size_t data_number) {
1713   if (data == nullptr) {
1714     return "Data of tensor is nullptr";
1715   }
1716   std::ostringstream oss;
1717   auto casted_data = static_cast<T *>(data);
1718   for (size_t i = 0; i < kDataToStringMaxNum && i < data_number; i++) {
1719     oss << " " << casted_data[i];
1720   }
1721   return oss.str();
1722 }
1723 
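// Produce a human-readable summary of a tensor: data type, shape and a truncated data preview.
// Float16 buffers are printed as their raw int16 bit patterns.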
1724 std::string DumpMSTensor(mindspore::MSTensor *tensor) {
1725   if (tensor == nullptr) {
1726     return "Tensor is nullptr";
1727   }
1728   std::ostringstream oss;
1729   oss << " DataType: " << static_cast<int>(tensor->DataType());
1730   oss << " Shape:";
1731   for (auto &dim : tensor->Shape()) {
1732     oss << " " << dim;
1733   }
1734   oss << std::endl << " Data:";
1735   switch (static_cast<int>(tensor->DataType())) {
1736     case kNumberTypeFloat32: {
1737       oss << DataToString<float>(tensor->MutableData(), tensor->ElementNum());
1738     } break;
1739     case kNumberTypeFloat16: {
1740       oss << DataToString<int16_t>(tensor->MutableData(), tensor->ElementNum());
1741     } break;
1742     case kNumberTypeInt32: {
1743       oss << DataToString<int32_t>(tensor->MutableData(), tensor->ElementNum());
1744     } break;
1745     case kNumberTypeInt16: {
1746       oss << DataToString<int16_t>(tensor->MutableData(), tensor->ElementNum());
1747     } break;
1748     case kNumberTypeInt8: {
1749       oss << DataToString<int8_t>(tensor->MutableData(), tensor->ElementNum());
1750     } break;
1751     default:
1752       oss << "Unsupported data type to print";
1753       break;
1754   }
1755   return oss.str();
1756 }
1757 #ifndef BENCHMARK_CLIP_JSON
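// Build a dump file name of the form <op>_<input|output>_<idx>_shape_<dims>_<dtype>[_<format>].bin,
// with any '/' in the op name replaced by '.'.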
1758 std::string GenerateOutputFileName(mindspore::MSTensor *tensor, const std::string &op_name,
1759                                    const std::string &file_type, const size_t &idx) {
1760   std::string file_name = op_name;
1761   auto pos = file_name.find_first_of('/');
1762   while (pos != std::string::npos) {
1763     file_name.replace(pos, 1, ".");
1764     pos = file_name.find_first_of('/');
1765   }
1766   file_name += "_" + file_type + "_" + std::to_string(idx) + "_shape_";
1767   for (const auto &dim : tensor->Shape()) {
1768     file_name += std::to_string(dim) + "_";
1769   }
1770   if (kTypeIdMap.find(static_cast<int>(tensor->DataType())) != kTypeIdMap.end()) {
1771     file_name += kTypeIdMap.at(static_cast<int>(tensor->DataType()));
1772   }
1773   auto tensor_format = tensor->format();
1774   if (kTensorFormatMap.find(tensor_format) != kTensorFormatMap.end()) {
1775     file_name += "_" + kTensorFormatMap.at(tensor_format) + ".bin";
1776   } else {
1777     file_name += ".bin";
1778   }
1779 
1780   return file_name;
1781 }
1782 #endif
1783 }  // namespace
1784 
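// Install callbacks that print every node's input and output tensors after the node executes.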
1785 int BenchmarkUnifiedApi::InitPrintTensorDataCallbackParameter() {
1786   // before callback
1787   ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
1788                              const std::vector<mindspore::MSTensor> &before_outputs,
1789                              const MSCallBackParam &call_param) { return true; };
1790 
1791   // after callback
1792   ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
1793                             const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
1794     std::cout << "================================================================" << std::endl;
1795     std::cout << call_param.node_name << " inputs : " << std::endl;
1796     for (auto ms_tensor : after_inputs) {
1797       std::cout << DumpMSTensor(&ms_tensor) << std::endl;
1798     }
1799     std::cout << "----------------------------------------------------------------" << std::endl;
1800     std::cout << call_param.node_name << " outputs : " << std::endl;
1801     for (auto ms_tensor : after_outputs) {
1802       std::cout << DumpMSTensor(&ms_tensor) << std::endl;
1803     }
1804     std::cout << "================================================================" << std::endl;
1805     return true;
1806   };
1807   return RET_OK;
1808 }
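// Install callbacks that dump tensor data to binary files according to the dump config JSON:
// inputs are written before each node runs and outputs after it, filtered by the configured dump
// mode and kernel list.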
1809 int BenchmarkUnifiedApi::InitDumpTensorDataCallbackParameter() {
1810 #ifndef BENCHMARK_CLIP_JSON
1811   // before callback
1812   ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
1813                              const std::vector<mindspore::MSTensor> &before_outputs,
1814                              const MSCallBackParam &call_param) {
1815     auto dump_mode = dump_cfg_json_[dump::kSettings][dump::kMode].get<int>();
1816     auto input_output_mode = dump_cfg_json_[dump::kSettings][dump::kInputOutput].get<int>();
1817     auto kernels = dump_cfg_json_[dump::kSettings][dump::kKernels].get<std::vector<std::string>>();
1818     if (dump_mode == 0 || std::find(kernels.begin(), kernels.end(), call_param.node_name) != kernels.end()) {
1819       if (input_output_mode == 0 || input_output_mode == 1) {
1820         for (size_t i = 0; i < before_inputs.size(); i++) {
1821           auto ms_tensor = before_inputs.at(i);
1822           auto file_name = GenerateOutputFileName(&ms_tensor, call_param.node_name, "input", i);
1823           auto abs_file_path = dump_file_output_dir_ + "/" + file_name;
1824           if (WriteToBin(abs_file_path, ms_tensor.MutableData(), ms_tensor.DataSize()) != RET_OK) {  // save to file
1825             MS_LOG(ERROR) << "write tensor data to file failed.";
1826             return false;
1827           }
1828         }
1829       }
1830     }
1831     return true;
1832   };
1833 
1834   // after callback
1835   ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
1836                             const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
1837     auto dump_mode = dump_cfg_json_[dump::kSettings][dump::kMode].get<int>();
1838     auto input_output_mode = dump_cfg_json_[dump::kSettings][dump::kInputOutput].get<int>();
1839     auto kernels = dump_cfg_json_[dump::kSettings][dump::kKernels].get<std::vector<std::string>>();
1840     if (dump_mode == kDumpInputsAndOutputs ||
1841         std::find(kernels.begin(), kernels.end(), call_param.node_name) != kernels.end()) {
1842       if (input_output_mode == kDumpInputsAndOutputs || input_output_mode == kDumpOutputs) {
1843         for (size_t i = 0; i < after_outputs.size(); i++) {
1844           auto ms_tensor = after_outputs.at(i);
1845           auto file_name = GenerateOutputFileName(&ms_tensor, call_param.node_name, "output", i);
1846           auto abs_file_path = dump_file_output_dir_ + "/" + file_name;
1847           if (WriteToBin(abs_file_path, ms_tensor.MutableData(), ms_tensor.DataSize()) != RET_OK) {  // save to file
1848             MS_LOG(ERROR) << "write tensor data to file failed.";
1849             return false;
1850           }
1851         }
1852       }
1853     }
1854     return true;
1855   };
1856 #endif
1857   return RET_OK;
1858 }
1859 
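// In parallel-predict mode, detach any data pointer still attached to the input tensors and free
// the buffers held in all_inputs_data_.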
1860 BenchmarkUnifiedApi::~BenchmarkUnifiedApi() {
1861 #ifdef PARALLEL_INFERENCE
1862   if (!flags_->enable_parallel_predict_) {
1863     return;
1864   }
1865   for (auto tensor : ms_inputs_for_api_) {
1866     auto data = tensor.MutableData();
1867     if (data != nullptr) {
1868       tensor.SetData(nullptr);
1869     }
1870   }
1871   for (auto &input : all_inputs_data_) {
1872     for (auto &data : input) {
1873       if (data != nullptr) {
1874         auto buf = static_cast<char *>(data);
1875         delete[] buf;
1876         data = nullptr;
1877       }
1878     }
1879   }
1880 #endif
1881 }
1882 }  // namespace lite
1883 }  // namespace mindspore
1884