1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "tools/benchmark/benchmark_unified_api.h"
18 #include <cinttypes>
19 #include <algorithm>
20 #include <utility>
21 #include <functional>
22 #include <iomanip>
23 #include <limits>
24 #include "src/common/common.h"
25 #include "src/tensor.h"
26 #include "tools/common/string_util.h"
27 #include "nnacl/nnacl_common.h"
28 #ifdef ENABLE_ARM64
29 #include <linux/perf_event.h>
30 #include <sys/ioctl.h>
31 #include <asm/unistd.h>
32 #include <unistd.h>
33 #endif
34 #ifdef SUPPORT_NNIE
35 #include "include/hi_common.h"
36 #include "include/hi_comm_vb.h"
37 #include "include/mpi_sys.h"
38 #include "include/mpi_vb.h"
39 #endif
40 #ifdef PARALLEL_INFERENCE
41 #include <thread>
42 #include "src/common/config_file.h"
43 #endif
44 #include "include/c_api/model_c.h"
45 #include "include/c_api/context_c.h"
46 
47 namespace mindspore {
48 constexpr size_t kDataToStringMaxNum = 40;
49 constexpr int kPrintDataNum = 20;
50 constexpr int kFrequencyDefault = 3;
51 constexpr int kPercentageDivisor = 100;
52 constexpr int kDumpInputsAndOutputs = 0;
53 constexpr int kDumpOutputs = 2;
54 #ifdef PARALLEL_INFERENCE
55 constexpr int kMaxRequestNum = 200;
56 #endif
57 namespace lite {
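// GenerateGLTexture: for every model input, allocates a temporary host buffer, fills it with
// random data of the tensor's data type, and uploads it into an OpenGL texture through
// FillGLTextureToTensor. The host buffer is freed right after the upload.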
58 int BenchmarkUnifiedApi::GenerateGLTexture(std::map<std::string, GLuint> *input_gl_texture) {
59   for (auto tensor : ms_inputs_for_api_) {
60     float *input_data = reinterpret_cast<float *>(malloc(tensor.DataSize()));
61     if (input_data == nullptr) {
62       MS_LOG(ERROR) << "malloc input_data failed";
63       return RET_ERROR;
64     }
65     int status = GenerateRandomData(tensor.DataSize(), input_data, static_cast<int>(tensor.DataType()));
66     if (status != RET_OK) {
67       free(input_data);
68       std::cerr << "GenerateRandomData for inTensor failed: " << status << std::endl;
69       MS_LOG(ERROR) << "GenerateRandomData for inTensor failed:" << status;
70       return status;
71     }
72     status = FillGLTextureToTensor(input_gl_texture, &tensor, tensor.Name(), input_data);
73     free(input_data);
74     if (status != RET_OK) {
75       MS_LOG(ERROR) << "Fill GLTexture to input tensor failed: " << status;
76       return status;
77     }
78   }
79 
80   return RET_OK;
81 }
82 
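// FillGLTextureToTensor: maps the tensor shape onto (width, height, channel) for a 2D texture
// (2D shape -> H x C, 3D -> H x W x C, 4D NHWC -> H x W x C; the batch dimension is ignored).
// With data == nullptr an empty texture is created, otherwise the host buffer is copied into a
// new device texture, and the resulting texture id is recorded under the tensor name.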
83 int BenchmarkUnifiedApi::FillGLTextureToTensor(std::map<std::string, GLuint> *gl_texture, mindspore::MSTensor *tensor,
84                                                std::string name, void *data) {
85   MS_CHECK_TRUE_MSG(gl_texture != nullptr, RET_ERROR, "gl_texture is nullptr");
86   MS_CHECK_TRUE_MSG(tensor != nullptr, RET_ERROR, "tensor is nullptr");
87 
88   auto image_id = 0;
89 
90   int width = 1, height = 1, channel = 1;
91   if (tensor->Shape().size() == DIMENSION_2D) {
92     height = tensor->Shape()[kNHWC_N];
93     channel = tensor->Shape()[kNHWC_H];
94   } else if (tensor->Shape().size() == DIMENSION_3D) {
95     width = tensor->Shape()[kNHWC_H];
96     height = tensor->Shape()[kNHWC_N];
97     channel = tensor->Shape()[kNHWC_C];
98   } else if (tensor->Shape().size() == DIMENSION_4D) {
99     width = tensor->Shape()[kNHWC_W];
100     height = tensor->Shape()[kNHWC_H];
101     channel = tensor->Shape()[kNHWC_C];
102   } else {
103     MS_LOG(ERROR) << "the tensor shape is not supported";
104     return RET_ERROR;
105   }
106 
107   if (data == nullptr) {
108     image_id = gl_runtime_.GLCreateTexture(width, height, channel);
109   } else {
110     image_id = gl_runtime_.CopyHostToDeviceTexture(data, width, height, channel);
111   }
112 
113   if (image_id == GL_NONE) {
114     MS_LOG(ERROR) << "glMemPool CopyHostToDeviceTexture failed";
115     return RET_ERROR;
116   }
117   gl_texture->insert(std::pair<std::string, GLuint>(name, image_id));
118   return RET_OK;
119 }
120 
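// LoadAndBindGLTexture: builds the input textures (random data when flags_->in_data_file_ is
// empty, otherwise from the listed input files), creates empty output textures for every model
// output, and binds both maps to the model through BindGLTexture2DMemory.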
121 int BenchmarkUnifiedApi::LoadAndBindGLTexture() {
122   std::map<std::string, GLuint> input_gl_texture;
123   std::map<std::string, GLuint> output_gl_texture;
124 
125   if (flags_->in_data_file_.empty()) {
126     auto status = GenerateGLTexture(&input_gl_texture);
127     if (status != RET_OK) {
128       std::cerr << "Generate input GLTexture error " << status << std::endl;
129       MS_LOG(ERROR) << "Generate input GLTexture error " << status;
130       return status;
131     }
132   } else {
133     auto status = ReadGLTextureFile(&input_gl_texture);
134     if (status != RET_OK) {
135       std::cerr << "ReadGLTextureFile error, " << status << std::endl;
136       MS_LOG(ERROR) << "ReadGLTextureFile error, " << status;
137       return status;
138     }
139   }
140 
141   for (auto &tensor : ms_outputs_for_api_) {
142     auto status = FillGLTextureToTensor(&output_gl_texture, &tensor, tensor.Name());
143     if (status != RET_OK) {
144       MS_LOG(ERROR) << "Fill GLTexture to output tensor failed: " << status;
145       return status;
146     }
147   }
148 
149   auto status = ms_model_.BindGLTexture2DMemory(input_gl_texture, &output_gl_texture);
150   if (status != kSuccess) {
151     MS_LOG(ERROR) << "BindGLTexture2DMemory failed";
152     return RET_ERROR;
153   }
154   return RET_OK;
155 }
156 
157 int BenchmarkUnifiedApi::ReadGLTextureFile(std::map<std::string, GLuint> *input_gl_texture) {
158   if (ms_inputs_for_api_.empty()) {
159     return RET_OK;
160   }
161   if (this->flags_->in_data_type_ == kImage) {
162     MS_LOG(ERROR) << "Image input is not supported";
163     return RET_ERROR;
164   } else {
165     for (size_t i = 0; i < flags_->input_data_list_.size(); i++) {
166       auto tensor = ms_inputs_for_api_.at(i);
167       size_t size;
168       char *bin_buf = ReadFile(flags_->input_data_list_[i].c_str(), &size);
169       if (bin_buf == nullptr) {
170         MS_LOG(ERROR) << "ReadFile return nullptr";
171         return RET_ERROR;
172       }
173       auto tensor_data_size = tensor.DataSize();
174       if (size != tensor_data_size) {
175         std::cerr << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size
176                   << std::endl;
177         MS_LOG(ERROR) << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size;
178         delete[] bin_buf;
179         return RET_ERROR;
180       }
181 
182       auto status = FillGLTextureToTensor(input_gl_texture, &tensor, tensor.Name(), bin_buf);
183       delete[] bin_buf;
184       if (status != RET_OK) {
185         MS_LOG(ERROR) << "Fill GLTexture to input tensor failed: " << status;
186         return status;
187       }
188     }
189   }
190 
191   return RET_OK;
192 }
193 
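// LoadInput: entry point for input preparation. With flags_->enable_gl_texture_ the inputs are
// bound as OpenGL textures; otherwise host-side tensors are either filled with random data (when
// flags_->in_data_file_ is empty) or read from the given input files.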
194 int BenchmarkUnifiedApi::LoadInput() {
195   if (flags_->enable_gl_texture_ == true) {
196     if (lite::BenchmarkUnifiedApi::LoadAndBindGLTexture() != RET_OK) {
197       MS_LOG(ERROR) << "Generate input GLTexture error";
198       return RET_ERROR;
199     }
200     return RET_OK;
201   }
202 
203   if (flags_->in_data_file_.empty()) {
204     auto status = GenerateInputData();
205     if (status != RET_OK) {
206       std::cerr << "Generate input data error " << status << std::endl;
207       MS_LOG(ERROR) << "Generate input data error " << status;
208       return status;
209     }
210   } else {
211     auto status = ReadInputFile();
212     if (status != RET_OK) {
213       std::cerr << "ReadInputFile error, " << status << std::endl;
214       MS_LOG(ERROR) << "ReadInputFile error, " << status;
215       return status;
216     }
217   }
218   return RET_OK;
219 }
220 
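// GenerateInputData: in parallel-predict mode, raw buffers sized from flags_->resize_dims_ are
// allocated and filled with random data (only float32 and int32 inputs are supported there) and
// collected into all_inputs_data_; in the normal path each input tensor's own buffer is filled in
// place, and string tensors are rebuilt from a fixed sample string.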
221 int BenchmarkUnifiedApi::GenerateInputData() {
222 #ifdef PARALLEL_INFERENCE
223   if (flags_->enable_parallel_predict_) {
224     std::vector<void *> inputs;
225     for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
226       auto tensor_name = ms_inputs_for_api_[i].Name();
227       size_t size;
228       if (ms_inputs_for_api_[i].DataType() == static_cast<enum DataType>(kNumberTypeFloat32)) {
229         size = sizeof(float);
230       } else if (ms_inputs_for_api_[i].DataType() == static_cast<enum DataType>(kNumberTypeInt32)) {
231         size = sizeof(int32_t);
232       } else {
233         MS_LOG(ERROR) << "input data type is not supported in model pool.";
234         return RET_ERROR;
235       }
236       for (size_t j = 0; j < flags_->resize_dims_[i].size(); j++) {
237         size *= flags_->resize_dims_[i][j];
238       }
239       void *input_data = new (std::nothrow) char[size];
240       if (input_data == nullptr) {
241         MS_LOG(ERROR) << "new input_data failed";
242         for (auto &data : inputs) {
243           auto buf = static_cast<char *>(data);
244           delete[] buf;
245           data = nullptr;
246         }
247         return RET_ERROR;
248       }
249       inputs.push_back(input_data);
250       int status = GenerateRandomData(size, input_data, static_cast<int>(ms_inputs_for_api_[i].DataType()));
251       if (status != RET_OK) {
252         MS_LOG(ERROR) << "GenerateRandomData for inTensor failed:" << status;
253         for (auto &data : inputs) {
254           auto buf = static_cast<char *>(data);
255           delete[] buf;
256           data = nullptr;
257         }
258         return status;
259       }
260     }
261     all_inputs_data_.push_back(inputs);
262     return RET_OK;
263   }
264 #endif
265   for (auto &tensor : ms_inputs_for_api_) {
266     if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
267       MSTensor *input = MSTensor::StringsToTensor(tensor.Name(), {"you're the best."});
268       if (input == nullptr) {
269         std::cerr << "StringsToTensor failed" << std::endl;
270         MS_LOG(ERROR) << "StringsToTensor failed";
271         return RET_ERROR;
272       }
273       tensor = *input;
274       delete input;
275     } else {
276       auto input_data = tensor.MutableData();
277       if (input_data == nullptr) {
278         MS_LOG(ERROR) << "MallocData for inTensor failed";
279         return RET_ERROR;
280       }
281       int status = GenerateRandomData(tensor.DataSize(), input_data, static_cast<int>(tensor.DataType()));
282       if (status != RET_OK) {
283         std::cerr << "GenerateRandomData for inTensor failed: " << status << std::endl;
284         MS_LOG(ERROR) << "GenerateRandomData for inTensor failed:" << status;
285         return status;
286       }
287     }
288   }
289   return RET_OK;
290 }
291 
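// UpdateConfigInfo: appears to be a test/debug hook driven by the BENCHMARK_UPDATE_CONFIG_ENV
// environment variable. When its first character is '0', the model's cache config section is
// overridden with a small vocab size (100) and device cache size (40).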
292 void BenchmarkUnifiedApi::UpdateConfigInfo() {
293 #define WIPE_DEEP_CONFIG_ENV '0'
294 #define WIPE_DEEP_CONFIG_VOCAB_SIZE "100"
295 #define WIPE_DEEP_CONFIG_DEVICE_CACHE_SIZE "40"
296 
297   auto env = std::getenv("BENCHMARK_UPDATE_CONFIG_ENV");
298   if (env == nullptr) {
299     return;
300   }
301   if (env[0] == WIPE_DEEP_CONFIG_ENV) {
302     ms_model_.UpdateConfig(kMSCacheSection, std::make_pair(kMSCacheVocabSizeKey, WIPE_DEEP_CONFIG_VOCAB_SIZE));
303     ms_model_.UpdateConfig(kMSCacheSection, std::make_pair(kMSCacheDeviceSizeKey, WIPE_DEEP_CONFIG_DEVICE_CACHE_SIZE));
304   }
305   return;
306 }
307 
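// ReadInputFile: reads the binaries in flags_->input_data_list_ into the model inputs. In
// parallel-predict mode the raw file buffers are kept in all_inputs_data_ and handed to the
// runner later; otherwise each file must match the tensor's byte size exactly, and string
// tensors are rebuilt via StringsToTensor instead of a memcpy.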
308 int BenchmarkUnifiedApi::ReadInputFile() {
309 #ifdef PARALLEL_INFERENCE
310   if (flags_->enable_parallel_predict_) {
311     std::vector<void *> inputs;
312     for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
313       size_t size;
314       char *bin_buf = ReadFile(flags_->input_data_list_[i].c_str(), &size);
315       if (bin_buf == nullptr) {
316         MS_LOG(ERROR) << "ReadFile return nullptr";
317         for (auto &data : inputs) {
318           auto buf = static_cast<char *>(data);
319           delete[] buf;
320           data = nullptr;
321         }
322         return RET_ERROR;
323       }
324       inputs.push_back(bin_buf);
325     }
326     all_inputs_data_.push_back(inputs);
327     return RET_OK;
328   }
329 #endif
330   if (ms_inputs_for_api_.empty()) {
331     return RET_OK;
332   }
333 
334   if (this->flags_->in_data_type_ == kImage) {
335     MS_LOG(ERROR) << "Image input is not supported";
336     return RET_ERROR;
337   } else {
338     for (size_t i = 0; i < flags_->input_data_list_.size(); i++) {
339       auto &cur_tensor = ms_inputs_for_api_.at(i);
340       size_t size;
341       char *bin_buf = ReadFile(flags_->input_data_list_[i].c_str(), &size);
342       if (bin_buf == nullptr) {
343         MS_LOG(ERROR) << "ReadFile return nullptr";
344         return RET_ERROR;
345       }
346       if (static_cast<int>(cur_tensor.DataType()) == kObjectTypeString) {
347         std::string str(bin_buf, size);
348         MSTensor *input = MSTensor::StringsToTensor(cur_tensor.Name(), {str});
349         if (input == nullptr) {
350           std::cerr << "StringsToTensor failed" << std::endl;
351           MS_LOG(ERROR) << "StringsToTensor failed";
352           delete[] bin_buf;
353           return RET_ERROR;
354         }
355         cur_tensor = *input;
356       } else {
357         auto tensor_data_size = cur_tensor.DataSize();
358         if (size != tensor_data_size) {
359           std::cerr << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size
360                     << std::endl;
361           MS_LOG(ERROR) << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size;
362           delete[] bin_buf;
363           return RET_ERROR;
364         }
365         auto input_data = cur_tensor.MutableData();
366         if (input_data == nullptr) {
367           MS_LOG(ERROR) << "input_data is nullptr.";
368           delete[] bin_buf;
369           return RET_ERROR;
370         }
371         memcpy(input_data, bin_buf, tensor_data_size);
372       }
373       delete[] bin_buf;
374     }
375   }
376   return RET_OK;
377 }
378 
379 int BenchmarkUnifiedApi::GetDataTypeByTensorName(const std::string &tensor_name) {
380 #ifdef PARALLEL_INFERENCE
381   for (auto tensor : ms_outputs_for_api_) {
382     auto name = tensor.Name();
383     if (name == tensor_name) {
384       return static_cast<int>(tensor.DataType());
385     }
386   }
387   MS_LOG(ERROR) << "cannot find tensor name: " << tensor_name << " in model output.";
388   return static_cast<int>(DataType::kTypeUnknown);
389 #endif
390   return static_cast<int>(ms_model_.GetOutputByTensorName(tensor_name).DataType());
391 }
392 
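// UpdateDistributionName: only used for multi-device GPU runs. For a non-zero rank it binds this
// process to its rank's device and rewrites the model/config file name so the rank-specific file
// is loaded (e.g. "net.mindir" becomes "net1.mindir" for rank 1).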
393 void BenchmarkUnifiedApi::UpdateDistributionName(const std::shared_ptr<mindspore::Context> &context,
394                                                  std::string *name) {
395   if (flags_->device_ != "GPU") {
396     return;
397   }
398 
399   if (name->size() == 0) {
400     return;
401   }
402 
403   if (context->MutableDeviceInfo().size() == 0) {
404     return;
405   }
406 
407   auto device_info = context->MutableDeviceInfo().front();
408   GPUDeviceInfo *gpu_info = reinterpret_cast<GPUDeviceInfo *>(device_info.get());
409   auto rank_id = gpu_info->GetRankID();
410   if (rank_id == 0) {
411     return;
412   }
413   gpu_info->SetDeviceID(rank_id);
414 
415   /* model file & benchmark data file names contain ".mindir";
416      config file names contain ".config" */
417   auto replace_pos = name->find(".mindir");
418   if (replace_pos == std::string::npos) {
419     replace_pos = name->find(".config");
420   }
421 
422   if (replace_pos == std::string::npos) {
423     return;
424   }
425 
426   *name = name->replace(replace_pos, sizeof('.'), std::to_string(rank_id) + ".");
427 
428   MS_LOG(INFO) << "Update distribution info: " << *name;
429   std::cout << "Update distribution info: " << *name << std::endl;
430   return;
431 }
432 
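// InitMSContextForGPU: the device id comes from flags_->device_id_ when it is set (>= 0),
// otherwise from the GPU_DEVICE_ID environment variable (default 0). With
// flags_->enable_gl_texture_ the current EGL context and display are attached; otherwise the
// "tensorrt" provider is selected.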
433 void BenchmarkUnifiedApi::InitMSContextForGPU(const std::shared_ptr<mindspore::Context> &context,
434                                               std::vector<std::shared_ptr<DeviceInfoContext>> *device_list) {
435   std::shared_ptr<GPUDeviceInfo> gpu_device_info = std::make_shared<GPUDeviceInfo>();
436   gpu_device_info->SetEnableFP16(flags_->enable_fp16_);
437   uint32_t device_id = 0;
438   auto device_id_env = std::getenv("GPU_DEVICE_ID");
439   if (device_id_env != nullptr) {
440 //    try {
441       device_id = static_cast<uint32_t>(std::stoul(device_id_env));
442 //    } catch (std::invalid_argument &e) {
443 //      MS_LOG(WARNING) << "Invalid device id env:" << device_id_env << ". Set default device id 0.";
444 //    }
445     MS_LOG(INFO) << "GPU device_id = " << device_id;
446   }
447   gpu_device_info->SetDeviceID(device_id);
448   if (flags_->device_id_ >= 0) {
449     gpu_device_info->SetDeviceID(flags_->device_id_);
450     MS_LOG(INFO) << "GPU device_id = " << flags_->device_id_;
451   }
452   if (flags_->enable_gl_texture_) {
453     gpu_device_info->SetEnableGLTexture(flags_->enable_gl_texture_);
454 
455     auto gl_context = eglGetCurrentContext();
456     gpu_device_info->SetGLContext(gl_context);
457 
458     auto gl_display = eglGetCurrentDisplay();
459     gpu_device_info->SetGLDisplay(gl_display);
460   } else {
461     gpu_device_info->SetProvider("tensorrt");
462     gpu_device_info->SetAllocator(nullptr);
463   }
464   device_list->push_back(gpu_device_info);
465 }
466 
467 void BenchmarkUnifiedApi::InitMSContextForAscend(const std::shared_ptr<mindspore::Context> &context,
468                                                  std::vector<std::shared_ptr<DeviceInfoContext>> *device_list) {
469   uint32_t device_id = 0;
470   auto device_id_env = std::getenv("ASCEND_DEVICE_ID");
471   if (device_id_env != nullptr) {
472 //    try {
473       device_id = static_cast<uint32_t>(std::stoul(device_id_env));
474 //    } catch (std::invalid_argument &e) {
475 //      MS_LOG(WARNING) << "Invalid device id env:" << device_id_env << ". Set default device id 0.";
476 //    }
477     MS_LOG(INFO) << "Ascend device_id = " << device_id;
478   }
479   std::shared_ptr<AscendDeviceInfo> ascend_device_info = std::make_shared<AscendDeviceInfo>();
480   ascend_device_info->SetDeviceID(device_id);
481   ascend_device_info->SetProvider(flags_->provider_);
482   auto back_policy_env = std::getenv("ASCEND_BACK_POLICY");
483   if (back_policy_env != nullptr) {
484     ascend_device_info->SetProvider(back_policy_env);
485   }
486 #ifdef ENABLE_CLOUD_FUSION_INFERENCE
487   if (flags_->device_id_ >= 0 && flags_->rank_id_ >= 0) {
488     ascend_device_info->SetDeviceID(flags_->device_id_);
489     ascend_device_info->SetRankID(flags_->rank_id_);
490     ascend_device_info->SetProvider("ge");
491   }
492 #endif
493   device_list->push_back(ascend_device_info);
494 }
495 
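// InitMSContext: applies the thread, affinity and delegate flags, then appends device infos in
// priority order according to flags_->device_ (GPU, Kirin NPU, Ascend, NNRT), always ending with
// a CPU device as the fallback. Requesting "Ascend" is rejected here because this OHOS build does
// not support it.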
496 int BenchmarkUnifiedApi::InitMSContext(const std::shared_ptr<mindspore::Context> &context) {
497   context->SetThreadNum(flags_->num_threads_);
498   context->SetGroupInfoFile(flags_->group_info_file_);
499   context->SetThreadAffinity(flags_->cpu_bind_mode_);
500   context->SetInterOpParallelNum(flags_->inter_op_parallel_num_);
501   if (!flags_->core_list_.empty()) {
502     context->SetThreadAffinity(flags_->core_list_);
503   }
504 #ifndef ENABLE_CLOUD_FUSION_INFERENCE
505   if (flags_->delegate_mode_ == "CoreML") {
506     context->SetBuiltInDelegate(kCoreML);
507   } else if (flags_->delegate_mode_ == "NNAPI") {
508     context->SetBuiltInDelegate(kNNAPI);
509   }
510   context->SetEnableParallel(flags_->enable_parallel_);
511 #endif
512 
513   auto &device_list = context->MutableDeviceInfo();
514   if (flags_->device_ == "GPU" || flags_->device_ == "Auto") {
515     InitMSContextForGPU(context, &device_list);
516   }
517 
518   if (flags_->device_ == "NPU" || flags_->device_ == "Auto") {
519     std::shared_ptr<KirinNPUDeviceInfo> npu_device_info = std::make_shared<KirinNPUDeviceInfo>();
520     npu_device_info->SetEnableFP16(flags_->enable_fp16_);
521     npu_device_info->SetFrequency(kFrequencyDefault);
522     device_list.push_back(npu_device_info);
523   }
524 
525   if (flags_->device_ == "Ascend" || flags_->device_ == "Auto") {
526     MS_LOG(ERROR) << "OHOS does not support Ascend devices.";
527     return RET_NOT_SUPPORT;
528   }
529 
530   if (flags_->device_ == "NNRT" || flags_->device_ == "Auto") {
531     std::shared_ptr<NNRTDeviceInfo> nnrt_device_info = std::make_shared<NNRTDeviceInfo>();
532     size_t num = 0;
533     auto descs = OH_AI_GetAllNNRTDeviceDescs(&num);
534     NNRTDeviceDesc *desc_nnrt = nullptr;
535     for (size_t i = 0; i < num; i++) {
536       auto desc = OH_AI_GetElementOfNNRTDeviceDescs(descs, i);
537       auto name = OH_AI_GetNameFromNNRTDeviceDesc(desc);
538       if (strncmp(name, "NPU_", 4) == 0) {  // NPU inference, online compilation
539         desc_nnrt = desc;
540         break;
541       }
542     }
543     if (desc_nnrt == nullptr) {
544       BENCHMARK_LOG_ERROR("nnrt desc get failed");
545       return RET_ERROR;
546     }
547     auto id = OH_AI_GetDeviceIdFromNNRTDeviceDesc(desc_nnrt);
548     nnrt_device_info->SetDeviceID(id);
549     OH_AI_DestroyAllNNRTDeviceDescs(&descs);
550     device_list.push_back(nnrt_device_info);
551   }
552 
553   // CPU priority is behind GPU and NPU
554   std::shared_ptr<CPUDeviceInfo> device_info = std::make_shared<CPUDeviceInfo>();
555   device_info->SetEnableFP16(flags_->enable_fp16_);
556   device_info->SetProvider(flags_->provider_);
557   device_list.push_back(device_info);
558 
559   return RET_OK;
560 }
561 #ifdef PARALLEL_INFERENCE
562 int BenchmarkUnifiedApi::CompareOutputForModelPool(std::vector<mindspore::MSTensor> *outputs) {
563   if (outputs->empty()) {
564     MS_LOG(ERROR) << "outputs is empty.";
565     return RET_ERROR;
566   }
567   std::cout << "================ Comparing Output data ================" << std::endl;
568   float total_bias = 0;
569   int total_size = 0;
570   // check the output tensor name.
571   for (size_t i = 0; i < outputs->size(); i++) {
572     std::string tensor_name = outputs->at(i).Name();
573     mindspore::MSTensor tensor = outputs->at(i);
574     if (tensor == nullptr) {
575       MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
576       return RET_ERROR;
577     }
578     constexpr float kParallelRelative = 1e-7;
579     constexpr float kParallelAbsolute = 1e-10;
580     int ret = CompareDataGetTotalBiasAndSize(tensor_name, &tensor, &total_bias, &total_size, kParallelRelative,
581                                              kParallelAbsolute);
582     if (ret != RET_OK) {
583       MS_LOG(ERROR) << "Error in CompareData";
584       std::cerr << "Error in CompareData" << std::endl;
585       std::cout << "=======================================================" << std::endl << std::endl;
586       return ret;
587     }
588   }
589   float mean_bias;
590   if (total_size != 0) {
591     mean_bias = ((total_bias / float_t(total_size)) * kPercentageDivisor);
592   } else {
593     mean_bias = 0;
594   }
595 
596   std::cout << "Mean bias of all nodes/tensors: " << mean_bias << "%" << std::endl;
597   std::cout << "=======================================================" << std::endl << std::endl;
598 
599   if (mean_bias > this->flags_->accuracy_threshold_) {
600     MS_LOG(ERROR) << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%";
601     std::cerr << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%" << std::endl;
602     return RET_ERROR;
603   }
604   return RET_OK;
605 }
606 #endif
607 
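// Convert2Float32: expands an IEEE-754 half-precision bit pattern into a single-precision value
// by shifting the sign/exponent/mantissa fields and re-biasing the exponent (+0x38000000).
// Worked example: fp16 0x3C00 (1.0) -> (0x3C00 << 13) + 0x38000000 = 0x3F800000, i.e. 1.0f.
// Subnormal fp16 inputs (exponent bits all zero) are flushed to signed zero, and Inf/NaN are not
// special-cased by this routine.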
608 void Convert2Float32(float *__restrict out, const uint16_t in) {
609   uint32_t t1;
610   uint32_t t2;
611   uint32_t t3;
612 
613   t1 = in & 0x7fffu;
614   t2 = in & 0x8000u;
615   t3 = in & 0x7c00u;
616 
617   t1 <<= 13u;
618   t2 <<= 16u;
619 
620   t1 += 0x38000000;
621 
622   t1 = (t3 == 0 ? 0 : t1);
623 
624   t1 |= t2;
625 
626   *reinterpret_cast<uint32_t *>(out) = t1;  // write the assembled bit pattern, not its integer value
627 }
628 
629 namespace {
630 template <typename T>
631 bool VectorValueCompare(const std::vector<T> &vec1, const std::vector<T> &vec2) {
632   if (vec1.size() != vec2.size()) {
633     return false;
634   }
635   for (auto &ele : vec1) {
636     if (!IsContain(vec2, ele)) {
637       return false;
638     }
639   }
640   return true;
641 }
642 }  // namespace
643 
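// CompareOutput: checks that the calibration tensor names match the model output names, then
// accumulates the per-tensor bias against the calibration data (copying GL textures back to the
// host first when flags_->enable_gl_texture_ is set) and fails if the mean bias exceeds
// flags_->accuracy_threshold_.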
644 int BenchmarkUnifiedApi::CompareOutput() {
645   std::cout << "================ Comparing Output data ================" << std::endl;
646   float total_bias = 0;
647   int total_size = 0;
648   // check the output tensor name.
649   if (!VectorValueCompare(this->benchmark_tensor_names_, ms_model_.GetOutputTensorNames())) {
650     MS_LOG(ERROR) << "The output tensor name is wrong.";
651     return RET_ERROR;
652   }
653   for (const auto &calib_tensor : benchmark_data_) {
654     std::string tensor_name = calib_tensor.first;
655     mindspore::MSTensor tensor = ms_model_.GetOutputByTensorName(tensor_name);
656     if (tensor == nullptr) {
657       MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
658       return RET_ERROR;
659     }
660     int ret;
661     if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
662       std::vector<std::string> output_strings = MSTensor::TensorToStrings(tensor);
663       ret = CompareStringData(tensor_name, calib_tensor.second->strings_data, output_strings);
664     } else {
665       if (flags_->enable_gl_texture_) {
666         auto *gltexture_id = reinterpret_cast<GLuint *>(tensor.MutableData());
667         if (gltexture_id == nullptr) {
668           MS_LOG(ERROR) << "get gltexture_id failed";
669           return RET_ERROR;
670         }
671         auto tmp = gl_runtime_.CopyDeviceTextureToHost(*gltexture_id);
672         if (tmp == nullptr) {
673           MS_LOG(ERROR) << "CopyDeviceTextureToHost failed";
674           return RET_ERROR;
675         }
676         float *hostptr = reinterpret_cast<float *>(tmp);
677 
678         auto tensor_shape = tensor.Shape();
679         auto data_len =
680           std::accumulate(tensor_shape.begin(), tensor_shape.end(), sizeof(float), std::multiplies<size_t>());
681         auto *new_tensor = new (std::nothrow)
682           MSTensor(tensor_name, mindspore::DataType::kNumberTypeFloat32, tensor_shape, hostptr, data_len);
683         MS_CHECK_TRUE_MSG(new_tensor != nullptr, RET_ERROR, "new tensor failed");
684         if (new_tensor->MutableData() == nullptr) {
685           MS_LOG(ERROR) << "CopyDeviceTextureToHost failed";
686           delete new_tensor;
687           return RET_ERROR;
688         }
689         ret = CompareDataGetTotalBiasAndSize(tensor_name, new_tensor, &total_bias, &total_size);
690         delete new_tensor;
691       } else {
692         ret = CompareDataGetTotalBiasAndSize(tensor_name, &tensor, &total_bias, &total_size);
693       }
694     }
695     if (ret != RET_OK) {
696       MS_LOG(ERROR) << "Error in CompareData";
697       std::cerr << "Error in CompareData" << std::endl;
698       std::cout << "=======================================================" << std::endl << std::endl;
699       return ret;
700     }
701   }
702   float mean_bias;
703   if (total_size != 0) {
704     mean_bias = ((total_bias / float_t(total_size)) * kPercentageDivisor);
705   } else {
706     mean_bias = 0;
707   }
708 
709   std::cout << "Mean bias of all nodes/tensors: " << mean_bias << "%" << std::endl;
710   std::cout << "=======================================================" << std::endl << std::endl;
711 
712   if (mean_bias > this->flags_->accuracy_threshold_) {
713     MS_LOG(ERROR) << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%";
714     std::cerr << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%" << std::endl;
715     return RET_ERROR;
716   }
717   return RET_OK;
718 }
719 
720 int BenchmarkUnifiedApi::CompareOutputByCosineDistance(float cosine_distance_threshold) {
721   std::cout << "================ Comparing Output data ================" << std::endl;
722   float total_cosine_distance = 0;
723   int total_size = 0;
724   // check the output tensor name.
725   if (this->benchmark_tensor_names_ != ms_model_.GetOutputTensorNames()) {
726     MS_LOG(ERROR) << "The output tensor name is wrong.";
727     return RET_ERROR;
728   }
729   for (const auto &calib_tensor : benchmark_data_) {
730     std::string tensor_name = calib_tensor.first;
731     mindspore::MSTensor tensor = ms_model_.GetOutputByTensorName(tensor_name);
732     if (tensor == nullptr) {
733       MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
734       return RET_ERROR;
735     }
736     int ret;
737     if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
738       std::vector<std::string> output_strings = MSTensor::TensorToStrings(tensor);
739       ret = CompareStringData(tensor_name, calib_tensor.second->strings_data, output_strings);
740     } else {
741       ret = CompareDataGetTotalCosineDistanceAndSize(tensor_name, &tensor, &total_cosine_distance, &total_size);
742     }
743     if (ret != RET_OK) {
744       MS_LOG(ERROR) << "Error in CompareData";
745       std::cerr << "Error in CompareData" << std::endl;
746       std::cout << "=======================================================" << std::endl << std::endl;
747       return ret;
748     }
749   }
750   float mean_cosine_distance;
751   if (total_size != 0) {
752     mean_cosine_distance = total_cosine_distance / float_t(total_size);
753   } else {
754     mean_cosine_distance = CosineErrMaxVal;
755   }
756   mean_cosine_distance = 1 - mean_cosine_distance;
757   std::cout << "Cosine distance of all nodes/tensors: " << std::setprecision(std::numeric_limits<double>::digits10)
758             << mean_cosine_distance << std::endl;
759   std::cout << "=======================================================" << std::endl << std::endl;
760 
761   if (mean_cosine_distance < cosine_distance_threshold) {
762     MS_LOG(ERROR) << "cosine distance of all nodes/tensors is too small: " << mean_cosine_distance;
763     std::cerr << "Mean cosine distance of all nodes/tensors is too small: " << mean_cosine_distance << std::endl;
764     return RET_ERROR;
765   }
766   return RET_OK;
767 }
768 
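// CompareDataGetTotalBiasAndSize: dispatches CompareData on the tensor's data type and adds the
// resulting bias into the running totals. Float16 outputs are first expanded into a temporary
// float32 buffer so they can be compared against the float32 calibration data.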
769 int BenchmarkUnifiedApi::CompareDataGetTotalBiasAndSize(const std::string &name, mindspore::MSTensor *tensor,
770                                                         float *total_bias, int *total_size, float relative_tolerance,
771                                                         float absolute_tolerance) {
772   float bias = 0;
773   auto mutableData = tensor->MutableData();
774   if (mutableData == nullptr) {
775     MS_LOG(ERROR) << "mutableData is nullptr.";
776     return RET_ERROR;
777   }
778   switch (static_cast<int>(tensor->DataType())) {
779     case TypeId::kNumberTypeFloat:
780     case TypeId::kNumberTypeFloat32: {
781       bias = CompareData<float, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
782       break;
783     }
784     case TypeId::kNumberTypeInt8: {
785       bias = CompareData<int8_t, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
786       break;
787     }
788     case TypeId::kNumberTypeUInt8: {
789       bias = CompareData<uint8_t, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
790       break;
791     }
792     case TypeId::kNumberTypeInt32: {
793       bias = CompareData<int32_t, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
794       break;
795     }
796     case TypeId::kNumberTypeInt16: {
797       bias = CompareData<int16_t, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
798       break;
799     }
800     case TypeId::kNumberTypeBool: {
801       bias = CompareData<bool, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
802       break;
803     }
804     case TypeId::kNumberTypeFloat16: {
805       size_t shapeSize = 1;
806       for (int64_t dim : tensor->Shape()) {
807         if (dim <= 0) {
808           MS_LOG(ERROR) << "The shape of output " << name << " should be great than 0 after inference, got "
809                         << tensor->Shape();
810           return RET_ERROR;
811         }
812         MS_CHECK_FALSE_MSG(SIZE_MUL_OVERFLOW(shapeSize, static_cast<size_t>(dim)), RET_ERROR, "mul overflow");
813         shapeSize *= static_cast<size_t>(dim);
814       }
815       auto *floatArr = new float[shapeSize];
816       for (size_t i = 0; i < shapeSize; ++i) {
817         uint16_t tmpInt = reinterpret_cast<uint16_t *>(mutableData)[i];
818         floatArr[i] = ShortToFloat32(tmpInt);
819       }
820       bias = CompareData<float, int64_t>(name, tensor->Shape(), floatArr);
821       delete[] floatArr;
822       break;
823     }
824     default:
825       MS_LOG(ERROR) << "Datatype " << static_cast<int>(tensor->DataType()) << " is not supported.";
826       return RET_ERROR;
827   }
828   if (bias < 0) {
829     MS_LOG(ERROR) << "CompareData failed, name: " << name;
830     return RET_ERROR;
831   }
832   *total_bias += bias;
833   *total_size += 1;
834   return RET_OK;
835 }
836 int BenchmarkUnifiedApi::CompareDataGetTotalCosineDistanceAndSize(const std::string &name, mindspore::MSTensor *tensor,
837                                                                   float *total_cosine_distance, int *total_size) {
838   if (tensor == nullptr) {
839     MS_LOG(ERROR) << "tensor is nullptr.";
840     return RET_ERROR;
841   }
842   if (total_cosine_distance == nullptr) {
843     MS_LOG(ERROR) << "total_cosine_distance is nullptr.";
844     return RET_ERROR;
845   }
846   if (total_size == nullptr) {
847     MS_LOG(ERROR) << "total_size is nullptr.";
848     return RET_ERROR;
849   }
850   float bias = 0;
851   auto mutableData = tensor->MutableData();
852   if (mutableData == nullptr) {
853     MS_LOG(ERROR) << "mutableData is nullptr.";
854     return RET_ERROR;
855   }
856   int res = RET_OK;
857   switch (static_cast<int>(tensor->DataType())) {
858     case TypeId::kNumberTypeFloat:
859     case TypeId::kNumberTypeFloat32: {
860       res = CompareDatabyCosineDistance<float>(name, tensor->Shape(), mutableData, &bias);
861       break;
862     }
863     case TypeId::kNumberTypeFloat16: {
864       size_t shapeSize = 1;
865       for (int64_t dim : tensor->Shape()) {
866         if (dim <= 0) {
867           MS_LOG(ERROR) << "Invalid shape.";
868           return RET_ERROR;
869         }
870         MS_CHECK_FALSE_MSG(SIZE_MUL_OVERFLOW(shapeSize, static_cast<size_t>(dim)), RET_ERROR, "mul overflow");
871         shapeSize *= static_cast<size_t>(dim);
872       }
873       float *floatArr = new float[shapeSize];
874       for (size_t i = 0; i < shapeSize; ++i) {
875         uint16_t tmpInt = reinterpret_cast<uint16_t *>(mutableData)[i];
876         Convert2Float32(&floatArr[i], tmpInt);
877       }
878       // compare the converted float32 copy; writing float32 values back into the fp16 buffer would overflow it
879       bias = CompareData<float, int64_t>(name, tensor->Shape(), floatArr);
880       delete[] floatArr;
881       break;
882     }
883     case TypeId::kNumberTypeInt8: {
884       res = CompareDatabyCosineDistance<int8_t>(name, tensor->Shape(), mutableData, &bias);
885       break;
886     }
887     case TypeId::kNumberTypeUInt8: {
888       res = CompareDatabyCosineDistance<uint8_t>(name, tensor->Shape(), mutableData, &bias);
889       break;
890     }
891     case TypeId::kNumberTypeInt32: {
892       res = CompareDatabyCosineDistance<int32_t>(name, tensor->Shape(), mutableData, &bias);
893       break;
894     }
895     case TypeId::kNumberTypeInt16: {
896       res = CompareDatabyCosineDistance<int16_t>(name, tensor->Shape(), mutableData, &bias);
897       break;
898     }
899     case TypeId::kNumberTypeBool: {
900       res = CompareDatabyCosineDistance<bool>(name, tensor->Shape(), mutableData, &bias);
901       break;
902     }
903     default:
904       MS_LOG(ERROR) << "Datatype " << static_cast<int>(tensor->DataType()) << " is not supported.";
905       return RET_ERROR;
906   }
907   if (res != RET_OK) {
908     MS_LOG(ERROR) << "CompareData failed, name: " << name;
909     return RET_ERROR;
910   }
911   *total_cosine_distance += 1 - bias;
912   *total_size += 1;
913   return RET_OK;
914 }
915 
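// MarkPerformance: runs flags_->warm_up_loop_count_ untimed predictions, then
// flags_->loop_count_ timed ones, reporting min/max/average latency in milliseconds. With
// flags_->time_profiling_ (or flags_->perf_profiling_ on ARM64) the per-op tables collected by
// the callbacks are printed as well.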
916 int BenchmarkUnifiedApi::MarkPerformance() {
917   MS_LOG(INFO) << "Running warm up loops...";
918   std::cout << "Running warm up loops..." << std::endl;
919   std::vector<MSTensor> outputs;
920   for (int i = 0; i < flags_->warm_up_loop_count_; i++) {
921     auto status = ms_model_.Predict(ms_inputs_for_api_, &outputs);
922     if (status != kSuccess) {
923       MS_LOG(ERROR) << "Inference error ";
924       std::cerr << "Inference error " << std::endl;
925       return RET_ERROR;
926     }
927   }
928 
929   MS_LOG(INFO) << "Running benchmark loops...";
930   std::cout << "Running benchmark loops..." << std::endl;
931   uint64_t time_min = UINT64_MAX;
932   uint64_t time_max = 0;
933   uint64_t time_avg = 0;
934 
935   for (int i = 0; i < flags_->loop_count_; i++) {
936     auto inputs = ms_model_.GetInputs();
937     for (auto tensor : inputs) {
938       tensor.MutableData();  // prepare data
939     }
940     auto start = GetTimeUs();
941     auto status = ms_model_.Predict(ms_inputs_for_api_, &outputs, ms_before_call_back_, ms_after_call_back_);
942     if (status != kSuccess) {
943       MS_LOG(ERROR) << "Inference error ";
944       std::cerr << "Inference error ";
945       return RET_ERROR;
946     }
947 
948     auto end = GetTimeUs();
949     auto time = end - start;
950     time_min = std::min(time_min, time);
951     time_max = std::max(time_max, time);
952     time_avg += time;
953   }
954 
955   if (flags_->time_profiling_) {
956     const std::vector<std::string> per_op_name = {"opName", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
957     const std::vector<std::string> per_op_type = {"opType", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
958     (void)PrintResult(per_op_name, op_times_by_name_);
959     (void)PrintResult(per_op_type, op_times_by_type_);
960 #ifdef ENABLE_ARM64
961   } else if (flags_->perf_profiling_) {
962     if (flags_->perf_event_ == "CACHE") {
963       const std::vector<std::string> per_op_name = {"opName", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
964       const std::vector<std::string> per_op_type = {"opType", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
965       (void)PrintPerfResult(per_op_name, op_perf_by_name_);
966       (void)PrintPerfResult(per_op_type, op_perf_by_type_);
967     } else if (flags_->perf_event_ == "STALL") {
968       const std::vector<std::string> per_op_name = {"opName", "frontend(k)", "frontend(%)", "backendend(k)",
969                                                     "backendend(%)"};
970       const std::vector<std::string> per_op_type = {"opType", "frontend(k)", "frontend(%)", "backendend(k)",
971                                                     "backendend(%)"};
972       (void)PrintPerfResult(per_op_name, op_perf_by_name_);
973       (void)PrintPerfResult(per_op_type, op_perf_by_type_);
974     } else {
975       const std::vector<std::string> per_op_name = {"opName", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
976       const std::vector<std::string> per_op_type = {"opType", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
977       (void)PrintPerfResult(per_op_name, op_perf_by_name_);
978       (void)PrintPerfResult(per_op_type, op_perf_by_type_);
979     }
980 #endif
981   }
982 
983   if (flags_->loop_count_ > 0) {
984     time_avg /= static_cast<size_t>(flags_->loop_count_);
985     MS_LOG(INFO) << "Model = " << flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
986                  << ", NumThreads = " << flags_->num_threads_ << ", MinRunTime = " << time_min / kFloatMSEC
987                  << ", MaxRuntime = " << time_max / kFloatMSEC << ", AvgRunTime = " << time_avg / kFloatMSEC;
988     printf("Model = %s, NumThreads = %d, MinRunTime = %f ms, MaxRuntime = %f ms, AvgRunTime = %f ms\n",
989            flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1).c_str(), flags_->num_threads_,
990            time_min / kFloatMSEC, time_max / kFloatMSEC, time_avg / kFloatMSEC);
991   }
992   return RET_OK;
993 }
994 
995 int BenchmarkUnifiedApi::MarkAccuracy() {
996   MS_LOG(INFO) << "MarkAccuracy";
997   std::cout << "MarkAccuracy" << std::endl;
998 
999   int status = 0;
1000   if (flags_->enable_gl_texture_) {
1001     for (auto in_tensor : ms_inputs_for_api_) {
1002       auto *input = reinterpret_cast<GLuint *>(in_tensor.MutableData());
1003       if (input == nullptr) {
1004         MS_LOG(ERROR) << "get input data failed";
1005         return RET_ERROR;
1006       }
1007       float *hostptr = reinterpret_cast<float *>(gl_runtime_.CopyDeviceTextureToHost(*input));
1008       size_t print_num = 20;
1009       gl_runtime_.PrintImage2DData(hostptr, 1, 1, print_num);
1010     }
1011   } else {
1012     status = PrintInputData();
1013     if (status != RET_OK) {
1014       MS_LOG(ERROR) << "PrintInputData error " << status;
1015       std::cerr << "PrintInputData error " << status << std::endl;
1016       return status;
1017     }
1018   }
1019   std::vector<MSTensor> outputs;
1020   auto ret = ms_model_.Predict(ms_inputs_for_api_, &outputs, ms_before_call_back_, ms_after_call_back_);
1021   if (ret != kSuccess) {
1022     MS_LOG(ERROR) << "Inference error ";
1023     std::cerr << "Inference error " << std::endl;
1024     return RET_ERROR;
1025   }
1026   status = ReadCalibData();
1027   if (status != RET_OK) {
1028     MS_LOG(ERROR) << "Read calib data error " << status;
1029     std::cerr << "Read calib data error " << status << std::endl;
1030     return status;
1031   }
1032   status = CompareOutput();
1033   if (status != RET_OK) {
1034     MS_LOG(ERROR) << "Compare output error " << status;
1035     std::cerr << "Compare output error " << status << std::endl;
1036     return status;
1037   }
1038   if (this->flags_->cosine_distance_threshold_ >= -1) {
1039     status = CompareOutputByCosineDistance(this->flags_->cosine_distance_threshold_);
1040     if (status != RET_OK) {
1041       MS_LOG(ERROR) << "Compare output error by cosine distance " << status;
1042       std::cerr << "Compare output error by cosine distance " << status << std::endl;
1043       return status;
1044     }
1045   }
1046   return RET_OK;
1047 }
1048 
1049 int BenchmarkUnifiedApi::PrintInputData() {
1050   for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
1051     mindspore::MSTensor input = ms_inputs_for_api_[i];
1052     auto tensor_data_type = static_cast<int>(input.DataType());
1053 
1054     std::cout << "InData " << i << ": ";
1055     if (tensor_data_type == TypeId::kNumberTypeFloat16) {
1056       MS_LOG(INFO) << "DataType: " << TypeId::kNumberTypeFloat16;
1057       continue;
1058     }
1059     if (tensor_data_type == TypeId::kObjectTypeString) {
1060       std::vector<std::string> output_strings = MSTensor::TensorToStrings(input);
1061       size_t print_num = std::min(output_strings.size(), static_cast<size_t>(20));
1062       for (size_t j = 0; j < print_num; j++) {
1063         std::cout << output_strings[j] << std::endl;
1064       }
1065       continue;
1066     }
1067     size_t print_num = std::min(static_cast<int>(input.ElementNum()), kPrintDataNum);
1068     const void *in_data = input.MutableData();
1069     if (in_data == nullptr) {
1070       MS_LOG(ERROR) << "in_data is nullptr.";
1071       return RET_ERROR;
1072     }
1073 
1074     for (size_t j = 0; j < print_num; j++) {
1075       if (tensor_data_type == TypeId::kNumberTypeFloat32 || tensor_data_type == TypeId::kNumberTypeFloat) {
1076         std::cout << static_cast<const float *>(in_data)[j] << " ";
1077       } else if (tensor_data_type == TypeId::kNumberTypeInt8) {
1078         std::cout << static_cast<const int8_t *>(in_data)[j] << " ";
1079       } else if (tensor_data_type == TypeId::kNumberTypeUInt8) {
1080         std::cout << static_cast<const uint8_t *>(in_data)[j] << " ";
1081       } else if (tensor_data_type == TypeId::kNumberTypeInt32) {
1082         std::cout << static_cast<const int32_t *>(in_data)[j] << " ";
1083       } else if (tensor_data_type == TypeId::kNumberTypeInt64) {
1084         std::cout << static_cast<const int64_t *>(in_data)[j] << " ";
1085       } else if (tensor_data_type == TypeId::kNumberTypeBool) {
1086         std::cout << static_cast<const bool *>(in_data)[j] << " ";
1087       } else {
1088         MS_LOG(ERROR) << "Datatype: " << tensor_data_type << " is not supported.";
1089         return RET_ERROR;
1090       }
1091     }
1092     std::cout << std::endl;
1093   }
1094   return RET_OK;
1095 }
1096 #ifdef PARALLEL_INFERENCE
1097 void BenchmarkUnifiedApi::ModelParallelRunnerWarmUp(int index) {
1098   auto in = model_runner_.GetInputs();
1099   for (size_t i = 0; i < in.size(); i++) {
1100     in[i].SetShape(resize_dims_[i]);
1101     in[i].SetData(all_inputs_data_[index][i], false);
1102   }
1103   auto warm_up_start = GetTimeUs();
1104   std::vector<MSTensor> output;
1105   auto ret = model_runner_.Predict(in, &output);
1106   for (size_t j = 0; j < in.size(); j++) {
1107     in[j].SetData(nullptr);
1108   }
1109   if (ret != kSuccess) {
1110     model_parallel_runner_ret_failed_ = true;
1111     MS_LOG(ERROR) << "model pool predict failed.";
1112     return;
1113   }
1114   auto warm_up_end = GetTimeUs();
1115   std::cout << "warm up index: " << index << " | time: " << (warm_up_end - warm_up_start) / kFloatMSEC << " ms\n";
1116 }
1117 
1118 void BenchmarkUnifiedApi::ModelParallelRunnerRun(int task_num, int parallel_idx) {
1119   for (int i = 0; i < task_num || task_num == -1; i++) {
1120     while (!runner_run_start_) {
1121       continue;
1122     }
1123     int idx = parallel_idx + flags_->warm_up_loop_count_;
1124     auto in = model_runner_.GetInputs();
1125     if (idx >= static_cast<int>(all_inputs_data_.size())) {
1126       MS_LOG(ERROR) << "idx is too big: " << idx;
1127       return;
1128     }
1129     auto in_data = all_inputs_data_[idx];
1130     for (size_t tensor_index = 0; tensor_index < in.size(); tensor_index++) {
1131       in.at(tensor_index).SetShape(resize_dims_.at(tensor_index));
1132       in.at(tensor_index).SetData(all_inputs_data_.at(idx)[tensor_index], false);
1133     }
1134     auto predict_start = GetTimeUs();
1135     std::vector<MSTensor> output;
1136     auto ret = model_runner_.Predict(in, &output);
1137     if (ret != kSuccess) {
1138       model_parallel_runner_ret_failed_ = true;
1139       MS_LOG(ERROR) << "model pool predict failed.";
1140       for (auto &item : in) {
1141         item.SetData(nullptr);
1142       }
1143       return;
1144     }
1145     auto predict_end = GetTimeUs();
1146     std::cout << "parallel index: " << parallel_idx << " | task index: " << i
1147               << " | predict time: " << (predict_end - predict_start) / kFloatMSEC << " ms\n";
1148     for (size_t j = 0; j < in.size(); j++) {
1149       in[j].SetData(nullptr);
1150     }
1151     if (!flags_->benchmark_data_file_.empty()) {
1152       auto status = CompareOutputForModelPool(&output);
1153       if (status != RET_OK) {
1154         model_parallel_runner_ret_failed_ = true;
1155         MS_LOG(ERROR) << "Compare output error " << status;
1156         return;
1157       }
1158     }
1159   }
1160 }
1161 
1162 int BenchmarkUnifiedApi::AddConfigInfo(const std::shared_ptr<RunnerConfig> &runner_config) {
1163   if (!flags_->config_file_.empty()) {
1164     runner_config->SetConfigPath(flags_->config_file_);
1165   }
1166   std::map<std::string, std::string> config;
1167   if (flags_->enable_shared_thread_pool_) {
1168     config[kEnableSharedThreadPoolKey] = "true";
1169     if (!flags_->thread_num_limit_per_worker_.empty()) {
1170       config[kThreadNumLimitPerWorkerKey] = flags_->thread_num_limit_per_worker_;
1171     }
1172     if (!flags_->thread_num_remaining_per_worker_.empty()) {
1173       config[kThreadNumRemainingPerWorkerKey] = flags_->thread_num_remaining_per_worker_;
1174     }
1175   } else {
1176     config[kEnableSharedThreadPoolKey] = "false";
1177   }
1178   runner_config->SetConfigInfo(kSharedThreadPoolSection, config);
1179   return RET_OK;
1180 }
1181 
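// ParallelInference: initializes a ModelParallelRunner with flags_->workers_num_ workers,
// prepares one input set per warm-up iteration plus one per parallel thread, warms the pool up
// concurrently, and then launches flags_->parallel_num_ threads that each run
// flags_->parallel_task_num_ predictions, timing the whole run and optionally comparing outputs
// against the calibration data.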
1182 int BenchmarkUnifiedApi::ParallelInference(std::shared_ptr<mindspore::Context> context) {
1183   if (flags_->warm_up_loop_count_ > kMaxRequestNum || flags_->parallel_num_ > kMaxRequestNum) {
1184     MS_LOG(WARNING) << "in parallel predict, warm up loop count and parallel num should be less than " << kMaxRequestNum;
1185   }
1186 
1187   // model runner init
1188   auto runner_config = std::make_shared<RunnerConfig>();
1189   runner_config->SetContext(context);
1190   runner_config->SetWorkersNum(flags_->workers_num_);
1191   auto status = AddConfigInfo(runner_config);
1192   MS_CHECK_FALSE_MSG(status != kSuccess, RET_ERROR, "add config info for parallel predict failed.");
1193   auto model_init_start = GetTimeUs();
1194   auto ret = model_runner_.Init(flags_->model_file_, runner_config);
1195   MS_CHECK_FALSE_MSG(ret != kSuccess, RET_ERROR, "model pool init failed.");
1196   auto model_init_end = GetTimeUs();
1197 
1198   // load data
1199   ms_inputs_for_api_ = model_runner_.GetInputs();
1200   MS_CHECK_FALSE_MSG(ms_inputs_for_api_.empty(), RET_ERROR, "model pool input is empty.");
1201   ms_outputs_for_api_ = model_runner_.GetOutputs();
1202   MS_CHECK_FALSE_MSG(ms_outputs_for_api_.empty(), RET_ERROR, "model pool output is empty.");
1203 
1204   if (!flags_->graph_input_shape_map_.empty()) {
1205     // parse model input shapes from --inputShape flag
1206     std::vector<std::vector<int64_t>> resize_dims = ParseGraphInputShapeMap(model_runner_.GetInputs());
1207     MS_CHECK_FALSE_MSG(resize_dims.empty(), RET_ERROR, "resize dims empty.");
1208     (void)std::transform(resize_dims.begin(), resize_dims.end(), std::back_inserter(resize_dims_),
1209                          [&](const auto &shapes) { return shapes; });
1210   } else {
1211     (void)std::transform(flags_->resize_dims_.begin(), flags_->resize_dims_.end(), std::back_inserter(resize_dims_),
1212                          [&](auto &shapes) { return this->ConverterToInt64Vector<int>(shapes); });
1213   }
1214 
1215   for (int i = 0; i < flags_->parallel_num_ + flags_->warm_up_loop_count_; i++) {
1216     status = LoadInput();
1217     MS_CHECK_FALSE_MSG(status != RET_OK, status, "Generate input data error");
1218     std::vector<MSTensor> output;
1219     all_outputs_.push_back(output);
1220   }
1221   if (!flags_->benchmark_data_file_.empty()) {
1222     for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
1223       auto &tensor = ms_inputs_for_api_[i];
1224       tensor.SetShape(resize_dims_[i]);
1225       tensor.SetData(all_inputs_data_[0][i], false);
1226     }
1227     status = PrintInputData();
1228     MS_CHECK_FALSE_MSG(status != RET_OK, status, "PrintInputData error ");
1229     status = ReadCalibData();
1230     MS_CHECK_FALSE_MSG(status != RET_OK, status, "ReadCalibData error ");
1231   }
1232 
1233   // warm up
1234   std::vector<std::thread> model_thread_warm_up;
1235   for (int i = 0; i < flags_->warm_up_loop_count_; i++) {
1236     model_thread_warm_up.push_back(std::thread(&BenchmarkUnifiedApi::ModelParallelRunnerWarmUp, this, i));
1237   }
1238   for (auto &warm_up_thread : model_thread_warm_up) {
1239     warm_up_thread.join();
1240   }
1241   if (model_parallel_runner_ret_failed_) {
1242     return RET_ERROR;
1243   }
1244   std::cout << "=============== end warm up ===============\n";
1245   // do loop count
1246   std::vector<std::thread> model_thread_run;
1247   for (int parallel_num_idx = 0; parallel_num_idx < flags_->parallel_num_; parallel_num_idx++) {
1248     model_thread_run.push_back(
1249       std::thread(&BenchmarkUnifiedApi::ModelParallelRunnerRun, this, flags_->parallel_task_num_, parallel_num_idx));
1250   }
1251   auto start_run_time = lite::GetTimeUs();
1252   runner_run_start_ = true;
1253   for (auto &run_thread : model_thread_run) {
1254     run_thread.join();
1255   }
1256   auto end_run_time = lite::GetTimeUs();
1257   if (model_parallel_runner_ret_failed_) {
1258     return RET_ERROR;
1259   }
1260   std::cout << "=================================" << std::endl;
1261   std::cout << "parallel predict init time: " << (model_init_end - model_init_start) / kFloatMSEC << " ms\n";
1262   std::cout << "parallel predict all run time: " << (end_run_time - start_run_time) / kFloatMSEC << " ms\n";
1263   std::cout << "=================================" << std::endl;
1264   return RET_OK;
1265 }
1266 #endif
1267 
1268 int BenchmarkUnifiedApi::PrintOutputData() {
1269   for (size_t i = 0; i < ms_outputs_for_api_.size(); i++) {
1270     mindspore::MSTensor input = ms_outputs_for_api_[i];
1271     auto tensor_data_type = static_cast<int>(input.DataType());
1272 
1273     std::cout << "OutData " << i << ": ";
1274     if (tensor_data_type == TypeId::kNumberTypeFloat16) {
1275       MS_LOG(INFO) << "DataType: " << TypeId::kNumberTypeFloat16;
1276       continue;
1277     }
1278     if (tensor_data_type == TypeId::kObjectTypeString) {
1279       std::vector<std::string> output_strings = MSTensor::TensorToStrings(input);
1280       size_t print_num = std::min(output_strings.size(), static_cast<size_t>(20));
1281       for (size_t j = 0; j < print_num; j++) {
1282         std::cout << output_strings[j] << std::endl;
1283       }
1284       continue;
1285     }
1286     size_t print_num = std::min(static_cast<int>(input.ElementNum()), kPrintDataNum);
1287     const void *in_data = input.MutableData();
1288     if (in_data == nullptr) {
1289       MS_LOG(ERROR) << "out_data is nullptr.";
1290       return RET_ERROR;
1291     }
1292 
1293     for (size_t j = 0; j < print_num; j++) {
1294       if (tensor_data_type == TypeId::kNumberTypeFloat32 || tensor_data_type == TypeId::kNumberTypeFloat) {
1295         std::cout << static_cast<const float *>(in_data)[j] << " ";
1296       } else if (tensor_data_type == TypeId::kNumberTypeInt8) {
1297         std::cout << static_cast<const int8_t *>(in_data)[j] << " ";
1298       } else if (tensor_data_type == TypeId::kNumberTypeUInt8) {
1299         std::cout << static_cast<const uint8_t *>(in_data)[j] << " ";
1300       } else if (tensor_data_type == TypeId::kNumberTypeInt32) {
1301         std::cout << static_cast<const int32_t *>(in_data)[j] << " ";
1302       } else if (tensor_data_type == TypeId::kNumberTypeInt64) {
1303         std::cout << static_cast<const int64_t *>(in_data)[j] << " ";
1304       } else if (tensor_data_type == TypeId::kNumberTypeBool) {
1305         std::cout << static_cast<const bool *>(in_data)[j] << " ";
1306       } else {
1307         MS_LOG(ERROR) << "Datatype: " << tensor_data_type << " is not supported.";
1308         return RET_ERROR;
1309       }
1310     }
1311     std::cout << std::endl;
1312   }
1313   return RET_OK;
1314 }
1315 
1316 int BenchmarkUnifiedApi::CompileGraph(mindspore::ModelType model_type, const std::shared_ptr<Context> &context,
1317                                       const std::string &model_name) {
1318   Key dec_key;
1319   if (!flags_->decrypt_key_str_.empty()) {
1320     dec_key.len = lite::Hex2ByteArray(flags_->decrypt_key_str_, dec_key.key, kEncMaxLen);
1321     if (dec_key.len == 0) {
1322       MS_LOG(ERROR) << "dec_key.len == 0";
1323       return RET_INPUT_PARAM_INVALID;
1324     }
1325     flags_->decrypt_key_str_.clear();
1326   }
1327   Status ret;
1328   if (flags_->crypto_lib_path_.empty()) {
1329     ret = ms_model_.Build(flags_->model_file_, model_type, context);
1330   } else {
1331     ret =
1332       ms_model_.Build(flags_->model_file_, model_type, context, dec_key, flags_->dec_mode_, flags_->crypto_lib_path_);
1333   }
1334   memset(dec_key.key, 0, kEncMaxLen);
1335   if (ret != kSuccess) {
1336     MS_LOG(ERROR) << "ms_model_.Build failed while running " << model_name.c_str();
1337     std::cout << "ms_model_.Build failed while running " << model_name.c_str() << std::endl;
1338     return RET_ERROR;
1339   }
1340   return RET_OK;
1341 }
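
// Sketch of the decrypt-key handling in CompileGraph above (the key text is made up and
// lite::Hex2ByteArray is assumed to decode hex text into raw bytes):
//   flags_->decrypt_key_str_ = "30313233343536373839616263646566";   // 32 hex chars
//   dec_key.len = lite::Hex2ByteArray(flags_->decrypt_key_str_, dec_key.key, kEncMaxLen);
//   // dec_key.len == 16 raw bytes; the hex string is cleared immediately, and dec_key.key
//   // is memset to zero as soon as Build() returns, so the key does not outlive loading.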
1342 
1343 std::vector<std::vector<int64_t>> BenchmarkUnifiedApi::ParseGraphInputShapeMap(const std::vector<MSTensor> &inputs) {
1344   std::vector<std::vector<int64_t>> resize_dims;
1345   if (flags_->graph_input_shape_map_.size() != inputs.size()) {
1346     MS_LOG(ERROR) << "The number of inputs in the model does not match the parsed inputShape option. The model has ["
1347                   << inputs.size() << "] input(s), while the parsed inputShape has ["
1348                   << flags_->graph_input_shape_map_.size() << "] input(s).";
1349     return resize_dims;
1350   }
1351   for (auto &model_input : inputs) {
1352     if (flags_->graph_input_shape_map_.find(model_input.Name()) == flags_->graph_input_shape_map_.end()) {
1353       MS_LOG(ERROR) << "model input [" << model_input.Name()
1354                     << "] is not found in inputShape option, please double check";
1355       MS_LOG(ERROR) << "model input names are as follows:";
1356       for (auto &mod_input : inputs) {
1357         MS_LOG(ERROR) << mod_input.Name();
1358       }
1359       MS_LOG(ERROR) << "user input names are as follows:";
1360       for (auto &user_input : flags_->graph_input_shape_map_) {
1361         MS_LOG(ERROR) << user_input.first;
1362       }
1363       return resize_dims;
1364     } else {
1365       auto shapes = flags_->graph_input_shape_map_[model_input.Name()];
1366       resize_dims.push_back(this->ConverterToInt64Vector(shapes));
1367     }
1368   }
1369   return resize_dims;
1370 }
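
// Example of the mapping consumed by ParseGraphInputShapeMap (names and shapes are
// hypothetical; the map itself is filled by the flag parser elsewhere):
//   graph_input_shape_map_ = {{"input_1", {1, 224, 224, 3}}, {"input_2", {1, 10}}};
// For a model whose inputs are input_1 and input_2 (in that order), the function returns
// {{1, 224, 224, 3}, {1, 10}}, which RunBenchmark then feeds to ms_model_.Resize().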
1371 
1372 #ifdef PARALLEL_INFERENCE
1373 int BenchmarkUnifiedApi::RunParallelBenchmark(std::shared_ptr<mindspore::Context> context) {
1374   if (flags_->resize_dims_.empty() && flags_->graph_input_shape_map_.empty()) {
1375     MS_LOG(ERROR) << "model input shapes should be provided when using parallel predict, please specify --inputShape";
1376     return RET_ERROR;
1377   }
1378   auto status = ParallelInference(context);
1379   MS_CHECK_FALSE_MSG(status != RET_OK, RET_ERROR, "run model pool failed.");
1380   return RET_OK;
1381 }
1382 #endif
1383 
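// Overview of RunBenchmark below: optionally bring up the OpenGL runtime, map
// flags_->model_type_ through ModelTypeMap, create and initialize the Context, load the
// optional config file, hand off to the parallel-predict path when it is enabled, otherwise
// CompileGraph and Resize the inputs, record the prepare time, load or generate input data,
// and finish with GetBenchmarkResult() for the accuracy or performance run.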
1384 int BenchmarkUnifiedApi::RunBenchmark() {
1385   auto start_prepare_time = GetTimeUs();
1386 
1387   if (flags_->enable_gl_texture_) {
1388     if (!gl_runtime_.Init()) {
1389       MS_LOG(ERROR) << "opengl runtime init failed";
1390       std::cerr << "opengl runtime init failed" << std::endl;
1391       return RET_ERROR;
1392     }
1393   }
1394 
1395   // Load graph
1396   std::string model_name = flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1);
1397   auto iter = ModelTypeMap.find(flags_->model_type_);
1398   if (iter == ModelTypeMap.end()) {
1399     MS_LOG(ERROR) << "model_type " << flags_->model_type_ << " is invalid.";
1400     std::cerr << "model_type " << flags_->model_type_ << " is invalid.";
1401     return RET_ERROR;
1402   }
1403   mindspore::ModelType model_type = iter->second;
1404 
1405   MS_LOG(INFO) << "start unified benchmark run";
1406   std::cout << "start unified benchmark run" << std::endl;
1407 
1408   auto context = std::make_shared<mindspore::Context>();
1409   if (context == nullptr) {
1410     MS_LOG(ERROR) << "New context failed while running " << model_name.c_str();
1411     std::cerr << "New context failed while running " << model_name.c_str() << std::endl;
1412     return RET_ERROR;
1413   }
1414 
1415   auto status = InitMSContext(context);
1416   if (status != RET_OK) {
1417     MS_LOG(ERROR) << "InitMSContext failed while running " << model_name.c_str();
1418     std::cout << "InitMSContext failed while running " << model_name.c_str();
1419     return RET_ERROR;
1420   }
1421 
1422   (void)UpdateDistributionName(context, &flags_->model_file_);
1423   (void)UpdateDistributionName(context, &flags_->benchmark_data_file_);
1424   (void)UpdateDistributionName(context, &flags_->config_file_);
1425 
1426   if (!flags_->config_file_.empty()) {
1427     auto config_ret = ms_model_.LoadConfig(flags_->config_file_);
1428     if (config_ret != kSuccess) {
1429       MS_LOG(ERROR) << "ms_model_.LoadConfig failed while running " << model_name.c_str();
1430       std::cout << "ms_model_.LoadConfig failed while running " << model_name.c_str() << std::endl;
1431     }
1432   }
1433 
1434   UpdateConfigInfo();
1435 #ifdef PARALLEL_INFERENCE
1436   if (flags_->enable_parallel_predict_) {
1437     MS_CHECK_FALSE_MSG(RunParallelBenchmark(context) != RET_OK, RET_ERROR, "run model pool failed.");
1438     return RET_OK;
1439   }
1440 #endif
1441 
1442   status = CompileGraph(model_type, context, model_name);
1443   MS_CHECK_FALSE_MSG(status != RET_OK, status, "Compile graph failed.");
1444   if (!flags_->graph_input_shape_map_.empty()) {
1445     std::vector<std::vector<int64_t>> resize_dims = ParseGraphInputShapeMap(ms_model_.GetInputs());
1446     MS_CHECK_FALSE_MSG(resize_dims.empty(), RET_ERROR, "resize_dims is empty");
1447     auto ret = ms_model_.Resize(ms_model_.GetInputs(), resize_dims);
1448     if (ret != kSuccess) {
1449       MS_LOG(ERROR) << "Input tensor resize failed.";
1450       std::cout << "Input tensor resize failed.";
1451       return RET_ERROR;
1452     }
1453   } else if (!flags_->resize_dims_.empty()) {
1454     std::vector<std::vector<int64_t>> resize_dims;
1455     (void)std::transform(flags_->resize_dims_.begin(), flags_->resize_dims_.end(), std::back_inserter(resize_dims),
1456                          [&](auto &shapes) { return this->ConverterToInt64Vector<int>(shapes); });
1457 
1458     auto ret = ms_model_.Resize(ms_model_.GetInputs(), resize_dims);
1459     if (ret != kSuccess) {
1460       MS_LOG(ERROR) << "Input tensor resize failed.";
1461       std::cout << "Input tensor resize failed.";
1462       return RET_ERROR;
1463     }
1464   }
1465 
1466   ms_inputs_for_api_ = ms_model_.GetInputs();
1467   ms_outputs_for_api_ = ms_model_.GetOutputs();
1468   auto end_prepare_time = GetTimeUs();
1469   MS_LOG(INFO) << "PrepareTime = " << ((end_prepare_time - start_prepare_time) / kFloatMSEC) << " ms";
1470   std::cout << "PrepareTime = " << ((end_prepare_time - start_prepare_time) / kFloatMSEC) << " ms" << std::endl;
1471 
1472   // Load input
1473   MS_LOG(INFO) << "start generate input data";
1474   status = LoadInput();
1475   if (status != RET_OK) {
1476     MS_LOG(ERROR) << "Generate input data error";
1477     return status;
1478   }
1479   return GetBenchmarkResult();
1480 }
1481 
1482 int BenchmarkUnifiedApi::GetBenchmarkResult() {
1483   if (!flags_->benchmark_data_file_.empty()) {
1484     auto status = MarkAccuracy();
1485     if (status != RET_OK) {
1486       MS_LOG(ERROR) << "Run MarkAccuracy error: " << status;
1487       std::cout << "Run MarkAccuracy error: " << status << std::endl;
1488       return status;
1489     }
1490   } else {
1491     auto status = MarkPerformance();
1492     if (status != RET_OK) {
1493       MS_LOG(ERROR) << "Run MarkPerformance error: " << status;
1494       std::cout << "Run MarkPerformance error: " << status << std::endl;
1495       return status;
1496     }
1497   }
1498   if (flags_->dump_tensor_data_) {
1499     std::cout << "Dumped files are saved to: " << dump_file_output_dir_ << std::endl;
1500   }
1501   Status finalize_ret = ms_model_.Finalize();
1502   if (finalize_ret == kSuccess) {
1503     MS_LOG(INFO) << "Benchmark finalize executed successfully.";
1504   }
1505   return RET_OK;
1506 }
1507 
1508 int BenchmarkUnifiedApi::InitTimeProfilingCallbackParameter() {
1509   if (flags_->inter_op_parallel_num_ > 1) {
1510     // before callback
1511     ms_before_call_back_ = [&, this](const std::vector<mindspore::MSTensor> &before_inputs,
1512                                      const std::vector<mindspore::MSTensor> &before_outputs,
1513                                      const MSCallBackParam &call_param) {
1514       if (before_inputs.empty()) {
1515         MS_LOG(INFO) << "beforeInputs is empty";
1516       }
1517       if (before_outputs.empty()) {
1518         MS_LOG(INFO) << "beforeOutputs is empty";
1519       }
1520       {
1521         std::lock_guard<std::mutex> _l(op_times_mutex_);
1522         if (op_times_by_type_.find(call_param.node_type) == op_times_by_type_.end()) {
1523           op_times_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, 0.0f)));
1524         }
1525         if (op_times_by_name_.find(call_param.node_name) == op_times_by_name_.end()) {
1526           op_times_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, 0.0f)));
1527         }
1528         op_start_times_by_name_[call_param.node_name] = GetTimeUs();
1529         op_call_times_total_++;
1530       }
1531       return true;
1532     };
1533 
1534     // after callback
1535     ms_after_call_back_ = [&, this](const std::vector<mindspore::MSTensor> &after_inputs,
1536                                     const std::vector<mindspore::MSTensor> &after_outputs,
1537                                     const MSCallBackParam &call_param) {
1538       uint64_t opEnd = GetTimeUs();
1539 
1540       if (after_inputs.empty()) {
1541         MS_LOG(INFO) << "afterInputs is empty";
1542       }
1543       if (after_outputs.empty()) {
1544         MS_LOG(INFO) << "afterOutputs is empty";
1545       }
1546       {
1547         std::lock_guard<std::mutex> _l(op_times_mutex_);
1548         float cost = static_cast<float>(opEnd - op_start_times_by_name_[call_param.node_name]) / kFloatMSEC;
1549         if (flags_->device_ == "GPU") {
1550           cost = static_cast<float>(call_param.execute_time);
1551         }
1552         op_cost_total_ += cost;
1553         op_times_by_type_[call_param.node_type].first++;
1554         op_times_by_type_[call_param.node_type].second += cost;
1555         op_times_by_name_[call_param.node_name].first++;
1556         op_times_by_name_[call_param.node_name].second += cost;
1557       }
1558       return true;
1559     };
1560   } else {
1561     // before callback
1562     ms_before_call_back_ = [&, this](const std::vector<mindspore::MSTensor> &before_inputs,
1563                                      const std::vector<mindspore::MSTensor> &before_outputs,
1564                                      const MSCallBackParam &call_param) {
1565       if (before_inputs.empty()) {
1566         MS_LOG(INFO) << "beforeInputs is empty";
1567       }
1568       if (before_outputs.empty()) {
1569         MS_LOG(INFO) << "beforeOutputs is empty";
1570       }
1571       if (op_times_by_type_.find(call_param.node_type) == op_times_by_type_.end()) {
1572         op_times_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, 0.0f)));
1573       }
1574       if (op_times_by_name_.find(call_param.node_name) == op_times_by_name_.end()) {
1575         op_times_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, 0.0f)));
1576       }
1577 
1578       op_call_times_total_++;
1579       op_begin_ = GetTimeUs();
1580       return true;
1581     };
1582 
1583     // after callback
1584     ms_after_call_back_ = [&, this](const std::vector<mindspore::MSTensor> &after_inputs,
1585                                     const std::vector<mindspore::MSTensor> &after_outputs,
1586                                     const MSCallBackParam &call_param) {
1587       uint64_t opEnd = GetTimeUs();
1588 
1589       if (after_inputs.empty()) {
1590         MS_LOG(INFO) << "afterInputs is empty";
1591       }
1592       if (after_outputs.empty()) {
1593         MS_LOG(INFO) << "afterOutputs is empty";
1594       }
1595 
1596       float cost = static_cast<float>(opEnd - op_begin_) / kFloatMSEC;
1597       if (flags_->device_ == "GPU") {
1598         cost = static_cast<float>(call_param.execute_time);
1599       }
1600       op_cost_total_ += cost;
1601       op_times_by_type_[call_param.node_type].first++;
1602       op_times_by_type_[call_param.node_type].second += cost;
1603       op_times_by_name_[call_param.node_name].first++;
1604       op_times_by_name_[call_param.node_name].second += cost;
1605       return true;
1606     };
1607   }
1608   return RET_OK;
1609 }
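
// Minimal sketch of how the (call count, total ms) pairs gathered above are typically
// reduced to a per-op-type summary (illustrative only; the real report is produced
// elsewhere in the benchmark):
//   for (const auto &entry : op_times_by_type_) {
//     float avg_ms = entry.second.first > 0 ? entry.second.second / entry.second.first : 0.0f;
//     float ratio = op_cost_total_ > 0 ? entry.second.second / op_cost_total_ : 0.0f;
//     std::cout << entry.first << ": " << avg_ms << " ms/call, " << ratio * 100 << "%\n";
//   }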
1610 
1611 int BenchmarkUnifiedApi::InitPerfProfilingCallbackParameter() {
1612 #ifndef ENABLE_ARM64
1613   MS_LOG(ERROR) << "Only support perf_profiling on arm64.";
1614   return RET_ERROR;
1615 #else
1616   struct perf_event_attr pe, pe2;
1617   memset(&pe, 0, sizeof(struct perf_event_attr));
1618   memset(&pe2, 0, sizeof(struct perf_event_attr));
1619   pe.type = PERF_TYPE_HARDWARE;
1620   pe2.type = PERF_TYPE_HARDWARE;
1621   pe.size = sizeof(struct perf_event_attr);
1622   pe2.size = sizeof(struct perf_event_attr);
1623   pe.disabled = 1;
1624   pe2.disabled = 1;
1625   pe.exclude_kernel = 1;   // don't count kernel
1626   pe2.exclude_kernel = 1;  // don't count kernel
1627   pe.exclude_hv = 1;       // don't count hypervisor
1628   pe2.exclude_hv = 1;      // don't count hypervisor
1629   pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
1630   pe2.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
1631   if (flags_->perf_event_ == "CACHE") {
1632     pe.config = PERF_COUNT_HW_CACHE_REFERENCES;
1633     pe2.config = PERF_COUNT_HW_CACHE_MISSES;
1634   } else if (flags_->perf_event_ == "STALL") {
1635     pe.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;
1636     pe2.config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND;
1637   } else {
1638     pe.config = PERF_COUNT_HW_CPU_CYCLES;
1639     pe2.config = PERF_COUNT_HW_INSTRUCTIONS;
1640   }
1641   perf_fd = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
1642   if (perf_fd == -1) {
1643     MS_LOG(ERROR) << "Failed to open perf event " << pe.config;
1644     return RET_ERROR;
1645   }
1646   perf_fd2 = syscall(__NR_perf_event_open, &pe2, 0, -1, perf_fd, 0);
1647   if (perf_fd2 == -1) {
1648     MS_LOG(ERROR) << "Failed to open perf event " << pe2.config;
1649     return RET_ERROR;
1650   }
1651   struct PerfCount zero;
1652   zero.value[0] = 0;
1653   zero.value[1] = 0;
1654   // before callback
1655   ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
1656                              const std::vector<mindspore::MSTensor> &before_outputs,
1657                              const MSCallBackParam &call_param) {
1658     if (before_inputs.empty()) {
1659       MS_LOG(INFO) << "beforeInputs is empty";
1660     }
1661     if (before_outputs.empty()) {
1662       MS_LOG(INFO) << "beforeOutputs is empty";
1663     }
1664     if (op_perf_by_type_.find(call_param.node_type) == op_perf_by_type_.end()) {
1665       op_perf_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, zero)));
1666     }
1667     if (op_perf_by_name_.find(call_param.node_name) == op_perf_by_name_.end()) {
1668       op_perf_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, zero)));
1669     }
1670 
1671     op_call_times_total_++;
1672     ioctl(perf_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
1673     ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
1674     return true;
1675   };
1676 
1677   // after callback
1678   ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
1679                             const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
1680     struct PerfResult res;
1681     ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
1682     if (read(perf_fd, &res, sizeof(struct PerfResult)) == -1) {
1683       MS_LOG(ERROR) << "Failed to read perf_fd";
1684       return false;
1685     }
1686 
1687     if (after_inputs.empty()) {
1688       MS_LOG(INFO) << "afterInputs is empty";
1689     }
1690     if (after_outputs.empty()) {
1691       MS_LOG(INFO) << "afterOutputs is empty";
1692     }
1693     float cost1 = static_cast<float>(res.values[0].value);
1694     float cost2 = static_cast<float>(res.values[1].value);
1695     op_cost_total_ += cost1;
1696     op_cost2_total_ += cost2;
1697     op_perf_by_type_[call_param.node_type].first++;
1698     op_perf_by_type_[call_param.node_type].second.value[0] += cost1;
1699     op_perf_by_type_[call_param.node_type].second.value[1] += cost2;
1700     op_perf_by_name_[call_param.node_name].first++;
1701     op_perf_by_name_[call_param.node_name].second.value[0] += cost1;
1702     op_perf_by_name_[call_param.node_name].second.value[1] += cost2;
1703     return true;
1704   };
1705 #endif
1706   return RET_OK;
1707 }
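
// The two hardware counters above are opened as one group (perf_fd2 joins perf_fd) with
// PERF_FORMAT_GROUP | PERF_FORMAT_ID, so a single read() on the group leader returns both
// values at once. The kernel lays that read out roughly as:
//   struct read_format {
//     uint64_t nr;                                   // number of counters in the group (2)
//     struct { uint64_t value; uint64_t id; } values[2];
//   };
// PerfResult, used in the after-callback, is assumed to mirror this layout.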
1708 
1709 namespace {
1710 template <typename T>
1711 std::string DataToString(void *data, size_t data_number) {
1712   if (data == nullptr) {
1713     return "Data of tensor is nullptr";
1714   }
1715   std::ostringstream oss;
1716   auto casted_data = static_cast<T *>(data);
1717   for (size_t i = 0; i < kDataToStringMaxNum && i < data_number; i++) {
1718     oss << " " << casted_data[i];
1719   }
1720   return oss.str();
1721 }
1722 
1723 std::string DumpMSTensor(mindspore::MSTensor *tensor) {
1724   if (tensor == nullptr) {
1725     return "Tensor is nullptr";
1726   }
1727   std::ostringstream oss;
1728   oss << " DataType: " << static_cast<int>(tensor->DataType());
1729   oss << " Shape:";
1730   for (auto &dim : tensor->Shape()) {
1731     oss << " " << dim;
1732   }
1733   oss << std::endl << " Data:";
1734   switch (static_cast<int>(tensor->DataType())) {
1735     case kNumberTypeFloat32: {
1736       oss << DataToString<float>(tensor->MutableData(), tensor->ElementNum());
1737     } break;
1738     case kNumberTypeFloat16: {
1739       oss << DataToString<int16_t>(tensor->MutableData(), tensor->ElementNum());  // float16 dumped as raw int16 bits
1740     } break;
1741     case kNumberTypeInt32: {
1742       oss << DataToString<int32_t>(tensor->MutableData(), tensor->ElementNum());
1743     } break;
1744     case kNumberTypeInt16: {
1745       oss << DataToString<int16_t>(tensor->MutableData(), tensor->ElementNum());
1746     } break;
1747     case kNumberTypeInt8: {
1748       oss << DataToString<int8_t>(tensor->MutableData(), tensor->ElementNum());
1749     } break;
1750     default:
1751       oss << "Unsupported data type to print";
1752       break;
1753   }
1754   return oss.str();
1755 }
1756 #ifndef BENCHMARK_CLIP_JSON
1757 std::string GenerateOutputFileName(mindspore::MSTensor *tensor, const std::string &op_name,
1758                                    const std::string &file_type, const size_t &idx) {
1759   std::string file_name = op_name;
1760   auto pos = file_name.find_first_of('/');
1761   while (pos != std::string::npos) {
1762     file_name.replace(pos, 1, ".");
1763     pos = file_name.find_first_of('/');
1764   }
1765   file_name += "_" + file_type + "_" + std::to_string(idx) + "_shape_";
1766   for (const auto &dim : tensor->Shape()) {
1767     file_name += std::to_string(dim) + "_";
1768   }
1769   if (kTypeIdMap.find(static_cast<int>(tensor->DataType())) != kTypeIdMap.end()) {
1770     file_name += kTypeIdMap.at(static_cast<int>(tensor->DataType()));
1771   }
1772   auto tensor_format = tensor->format();
1773   if (kTensorFormatMap.find(tensor_format) != kTensorFormatMap.end()) {
1774     file_name += "_" + kTensorFormatMap.at(tensor_format) + ".bin";
1775   } else {
1776     file_name += ".bin";
1777   }
1778 
1779   return file_name;
1780 }
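
// Example of a file name produced by GenerateOutputFileName (node name, type string and
// format string are hypothetical): for the first output of a node named "conv1/relu" with
// shape 1x224x224x3, Float32 data and NHWC format, the result would be
//   conv1.relu_output_0_shape_1_224_224_3_Float32_NHWC.bin
// since '/' in the node name is replaced with '.', and the format suffix is only appended
// when the tensor format is present in kTensorFormatMap.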
1781 #endif
1782 }  // namespace
1783 
1784 int BenchmarkUnifiedApi::InitPrintTensorDataCallbackParameter() {
1785   // before callback
1786   ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
1787                              const std::vector<mindspore::MSTensor> &before_outputs,
1788                              const MSCallBackParam &call_param) { return true; };
1789 
1790   // after callback
1791   ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
1792                             const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
1793     std::cout << "================================================================" << std::endl;
1794     std::cout << call_param.node_name << " inputs : " << std::endl;
1795     for (auto ms_tensor : after_inputs) {
1796       std::cout << DumpMSTensor(&ms_tensor) << std::endl;
1797     }
1798     std::cout << "----------------------------------------------------------------" << std::endl;
1799     std::cout << call_param.node_name << " outputs : " << std::endl;
1800     for (auto ms_tensor : after_outputs) {
1801       std::cout << DumpMSTensor(&ms_tensor) << std::endl;
1802     }
1803     std::cout << "================================================================" << std::endl;
1804     return true;
1805   };
1806   return RET_OK;
1807 }
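
// Dump settings used by the callbacks below, as read from dump_cfg_json_: dump_mode 0 dumps
// every kernel, any other value restricts dumping to the kernels listed under dump::kKernels;
// input_output_mode 0 dumps both inputs and outputs, 1 dumps inputs only, and
// 2 (kDumpOutputs) dumps outputs only. Each tensor is written as a .bin file under
// dump_file_output_dir_.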
1808 int BenchmarkUnifiedApi::InitDumpTensorDataCallbackParameter() {
1809 #ifndef BENCHMARK_CLIP_JSON
1810   // before callback
1811   ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
1812                              const std::vector<mindspore::MSTensor> &before_outputs,
1813                              const MSCallBackParam &call_param) {
1814     auto dump_mode = dump_cfg_json_[dump::kSettings][dump::kMode].get<int>();
1815     auto input_output_mode = dump_cfg_json_[dump::kSettings][dump::kInputOutput].get<int>();
1816     auto kernels = dump_cfg_json_[dump::kSettings][dump::kKernels].get<std::vector<std::string>>();
1817     if (dump_mode == 0 || std::find(kernels.begin(), kernels.end(), call_param.node_name) != kernels.end()) {
1818       if (input_output_mode == 0 || input_output_mode == 1) {
1819         for (size_t i = 0; i < before_inputs.size(); i++) {
1820           auto ms_tensor = before_inputs.at(i);
1821           auto file_name = GenerateOutputFileName(&ms_tensor, call_param.node_name, "input", i);
1822           auto abs_file_path = dump_file_output_dir_ + "/" + file_name;
1823           if (WriteToBin(abs_file_path, ms_tensor.MutableData(), ms_tensor.DataSize()) != RET_OK) {  // save to file
1824             MS_LOG(ERROR) << "write tensor data to file failed.";
1825             return false;
1826           }
1827         }
1828       }
1829     }
1830     return true;
1831   };
1832 
1833   // after callback
1834   ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
1835                             const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
1836     auto dump_mode = dump_cfg_json_[dump::kSettings][dump::kMode].get<int>();
1837     auto input_output_mode = dump_cfg_json_[dump::kSettings][dump::kInputOutput].get<int>();
1838     auto kernels = dump_cfg_json_[dump::kSettings][dump::kKernels].get<std::vector<std::string>>();
1839     if (dump_mode == kDumpInputsAndOutputs ||
1840         std::find(kernels.begin(), kernels.end(), call_param.node_name) != kernels.end()) {
1841       if (input_output_mode == kDumpInputsAndOutputs || input_output_mode == kDumpOutputs) {
1842         for (size_t i = 0; i < after_outputs.size(); i++) {
1843           auto ms_tensor = after_outputs.at(i);
1844           auto file_name = GenerateOutputFileName(&ms_tensor, call_param.node_name, "output", i);
1845           auto abs_file_path = dump_file_output_dir_ + "/" + file_name;
1846           if (WriteToBin(abs_file_path, ms_tensor.MutableData(), ms_tensor.DataSize()) != RET_OK) {  // save to file
1847             MS_LOG(ERROR) << "write tensor data to file failed.";
1848             return false;
1849           }
1850         }
1851       }
1852     }
1853     return true;
1854   };
1855 #endif
1856   return RET_OK;
1857 }
1858 
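// In the parallel-inference build the input MSTensors reference buffers owned by
// all_inputs_data_, so the destructor below detaches the tensors with SetData(nullptr)
// before the char[] buffers are freed, which presumably avoids releasing the same buffer
// twice.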
1859 BenchmarkUnifiedApi::~BenchmarkUnifiedApi() {
1860 #ifdef PARALLEL_INFERENCE
1861   if (!flags_->enable_parallel_predict_) {
1862     return;
1863   }
1864   for (auto tensor : ms_inputs_for_api_) {
1865     auto data = tensor.MutableData();
1866     if (data != nullptr) {
1867       tensor.SetData(nullptr);
1868     }
1869   }
1870   for (auto &input : all_inputs_data_) {
1871     for (auto &data : input) {
1872       if (data != nullptr) {
1873         auto buf = static_cast<char *>(data);
1874         delete[] buf;
1875         data = nullptr;
1876       }
1877     }
1878   }
1879 #endif
1880 }
1881 }  // namespace lite
1882 }  // namespace mindspore
1883