1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "tools/benchmark/benchmark_unified_api.h"
18 #include <cinttypes>
19 #include <algorithm>
20 #include <utility>
21 #include <functional>
22 #include <iomanip>
23 #include <limits>
24 #include "src/common/common.h"
25 #include "src/tensor.h"
26 #include "tools/common/string_util.h"
27 #include "nnacl/nnacl_common.h"
28 #ifdef ENABLE_ARM64
29 #include <linux/perf_event.h>
30 #include <sys/ioctl.h>
31 #include <asm/unistd.h>
32 #include <unistd.h>
33 #endif
34 #ifdef SUPPORT_NNIE
35 #include "include/hi_common.h"
36 #include "include/hi_comm_vb.h"
37 #include "include/mpi_sys.h"
38 #include "include/mpi_vb.h"
39 #endif
40 #ifdef PARALLEL_INFERENCE
41 #include <thread>
42 #include "src/common/config_file.h"
43 #endif
44 #include "include/c_api/model_c.h"
45 #include "include/c_api/context_c.h"
46
47 namespace mindspore {
48 constexpr size_t kDataToStringMaxNum = 40;
49 constexpr int kPrintDataNum = 20;
50 constexpr int kFrequencyDefault = 3;
51 constexpr int kPercentageDivisor = 100;
52 constexpr int kDumpInputsAndOutputs = 0;
53 constexpr int kDumpOutputs = 2;
54 #ifdef PARALLEL_INFERENCE
55 constexpr int kMaxRequestNum = 200;
56 #endif
57 namespace lite {
58 int BenchmarkUnifiedApi::GenerateGLTexture(std::map<std::string, GLuint> *input_gl_texture) {
59 for (auto tensor : ms_inputs_for_api_) {
60 float *input_data = reinterpret_cast<float *>(malloc(tensor.DataSize()));
61 if (input_data == nullptr) {
62 MS_LOG(ERROR) << "malloc input_data failed";
63 return RET_ERROR;
64 }
65 int status = GenerateRandomData(tensor.DataSize(), input_data, static_cast<int>(tensor.DataType()));
66 if (status != RET_OK) {
67 free(input_data);
68 std::cerr << "GenerateRandomData for inTensor failed: " << status << std::endl;
69 MS_LOG(ERROR) << "GenerateRandomData for inTensor failed:" << status;
70 return status;
71 }
72 status = FillGLTextureToTensor(input_gl_texture, &tensor, tensor.Name(), input_data);
73 free(input_data);
74 if (status != RET_OK) {
75 MS_LOG(ERROR) << "Fill GLTexture to input tensor failed: " << status;
76 return status;
77 }
78 }
79
80 return RET_OK;
81 }
82
83 int BenchmarkUnifiedApi::FillGLTextureToTensor(std::map<std::string, GLuint> *gl_texture, mindspore::MSTensor *tensor,
84 std::string name, void *data) {
85 MS_CHECK_TRUE_MSG(gl_texture != nullptr, RET_ERROR, "gl_texture is nullptr");
86 MS_CHECK_TRUE_MSG(tensor != nullptr, RET_ERROR, "tensor is nullptr");
87
88 auto image_id = 0;
89
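// Map the tensor shape onto 2D texture dimensions (width x height x channel) according to its rank;
// unsupported ranks are rejected below.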
90 int width = 1, height = 1, channel = 1;
91 if (tensor->Shape().size() == DIMENSION_2D) {
92 height = tensor->Shape()[kNHWC_N];
93 channel = tensor->Shape()[kNHWC_H];
94 } else if (tensor->Shape().size() == DIMENSION_3D) {
95 width = tensor->Shape()[kNHWC_H];
96 height = tensor->Shape()[kNHWC_N];
97 channel = tensor->Shape()[kNHWC_C];
98 } else if (tensor->Shape().size() == DIMENSION_4D) {
99 width = tensor->Shape()[kNHWC_W];
100 height = tensor->Shape()[kNHWC_H];
101 channel = tensor->Shape()[kNHWC_C];
102 } else {
103 MS_LOG(ERROR) << "the tensor shape is not supported";
104 return RET_ERROR;
105 }
106
107 if (data == nullptr) {
108 image_id = gl_runtime_.GLCreateTexture(width, height, channel);
109 } else {
110 image_id = gl_runtime_.CopyHostToDeviceTexture(data, width, height, channel);
111 }
112
113 if (image_id != GL_NONE) {
114 gl_texture->insert(std::pair<std::string, GLuint>(name, image_id));
115 } else {
116 MS_LOG(ERROR) << "glMemPool CopyHostToDeviceTexture failed";
117 }
118 return RET_OK;
119 }
120
121 int BenchmarkUnifiedApi::LoadAndBindGLTexture() {
122 std::map<std::string, GLuint> input_gl_texture;
123 std::map<std::string, GLuint> output_gl_texture;
124
125 if (flags_->in_data_file_.empty()) {
126 auto status = GenerateGLTexture(&input_gl_texture);
127 if (status != RET_OK) {
128 std::cerr << "Generate input GLTexture error " << status << std::endl;
129 MS_LOG(ERROR) << "Generate input GLTexture error " << status;
130 return status;
131 }
132 } else {
133 auto status = ReadGLTextureFile(&input_gl_texture);
134 if (status != RET_OK) {
135 std::cerr << "ReadGLTextureFile error, " << status << std::endl;
136 MS_LOG(ERROR) << "ReadGLTextureFile error, " << status;
137 return status;
138 }
139 }
140
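// No host data is passed for the outputs, so FillGLTextureToTensor creates empty textures for them;
// inference results are written into these textures on the device.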
141 for (auto &tensor : ms_outputs_for_api_) {
142 auto status = FillGLTextureToTensor(&output_gl_texture, &tensor, tensor.Name());
143 if (status != RET_OK) {
144 MS_LOG(ERROR) << "Fill GLTexture to output tensor failed: " << status;
145 return status;
146 }
147 }
148
149 auto status = ms_model_.BindGLTexture2DMemory(input_gl_texture, &output_gl_texture);
150 if (status != kSuccess) {
151 MS_LOG(ERROR) << "BindGLTexture2DMemory failed";
152 return RET_ERROR;
153 }
154 return RET_OK;
155 }
156
157 int BenchmarkUnifiedApi::ReadGLTextureFile(std::map<std::string, GLuint> *input_gl_texture) {
158 if (ms_inputs_for_api_.empty()) {
159 return RET_OK;
160 }
161 if (this->flags_->in_data_type_ == kImage) {
162 MS_LOG(ERROR) << "Not supported image input";
163 return RET_ERROR;
164 } else {
165 for (size_t i = 0; i < flags_->input_data_list_.size(); i++) {
166 auto tensor = ms_inputs_for_api_.at(i);
167 size_t size;
168 char *bin_buf = ReadFile(flags_->input_data_list_[i].c_str(), &size);
169 if (bin_buf == nullptr) {
170 MS_LOG(ERROR) << "ReadFile return nullptr";
171 return RET_ERROR;
172 }
173 auto tensor_data_size = tensor.DataSize();
174 if (size != tensor_data_size) {
175 std::cerr << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size
176 << std::endl;
177 MS_LOG(ERROR) << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size;
178 delete[] bin_buf;
179 return RET_ERROR;
180 }
181
182 auto status = FillGLTextureToTensor(input_gl_texture, &tensor, tensor.Name(), bin_buf);
183 delete[] bin_buf;
184 if (status != RET_OK) {
185 MS_LOG(ERROR) << "Fill GLTexture to input tensor failed: " << status;
186 return status;
187 }
188 }
189 }
190
191 return RET_OK;
192 }
193
194 int BenchmarkUnifiedApi::LoadInput() {
195 if (flags_->enable_gl_texture_ == true) {
196 if (lite::BenchmarkUnifiedApi::LoadAndBindGLTexture() != RET_OK) {
197 MS_LOG(ERROR) << "Generate input GLTexture error";
198 return RET_ERROR;
199 }
200 return RET_OK;
201 }
202
203 if (flags_->in_data_file_.empty()) {
204 auto status = GenerateInputData();
205 if (status != RET_OK) {
206 std::cerr << "Generate input data error " << status << std::endl;
207 MS_LOG(ERROR) << "Generate input data error " << status;
208 return status;
209 }
210 } else {
211 auto status = ReadInputFile();
212 if (status != RET_OK) {
213 std::cerr << "ReadInputFile error, " << status << std::endl;
214 MS_LOG(ERROR) << "ReadInputFile error, " << status;
215 return status;
216 }
217 }
218 return RET_OK;
219 }
220
221 int BenchmarkUnifiedApi::GenerateInputData() {
222 #ifdef PARALLEL_INFERENCE
223 if (flags_->enable_parallel_predict_) {
224 std::vector<void *> inputs;
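// Allocate one host buffer per graph input, sized from the element type and the user-specified
// resize dims, and fill it with random data.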
225 for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
226 auto tensor_name = ms_inputs_for_api_[i].Name();
227 size_t size;
228 if (ms_inputs_for_api_[i].DataType() == static_cast<enum DataType>(kNumberTypeFloat32)) {
229 size = sizeof(float);
230 } else if (ms_inputs_for_api_[i].DataType() == static_cast<enum DataType>(kNumberTypeInt32)) {
231 size = sizeof(int32_t);
232 } else {
233 MS_LOG(ERROR) << "input data type is not supported in model pool.";
234 return RET_ERROR;
235 }
236 for (size_t j = 0; j < flags_->resize_dims_[i].size(); j++) {
237 size *= flags_->resize_dims_[i][j];
238 }
239 void *input_data = new (std::nothrow) char[size];
240 if (input_data == nullptr) {
241 MS_LOG(ERROR) << "new input_data failed";
242 for (auto &data : inputs) {
243 auto buf = static_cast<char *>(data);
244 delete[] buf;
245 data = nullptr;
246 }
247 return RET_ERROR;
248 }
249 inputs.push_back(input_data);
250 int status = GenerateRandomData(size, input_data, static_cast<int>(ms_inputs_for_api_[i].DataType()));
251 if (status != RET_OK) {
252 MS_LOG(ERROR) << "GenerateRandomData for inTensor failed:" << status;
253 for (auto &data : inputs) {
254 auto buf = static_cast<char *>(data);
255 delete[] buf;
256 data = nullptr;
257 }
258 return status;
259 }
260 }
261 all_inputs_data_.push_back(inputs);
262 return RET_OK;
263 }
264 #endif
265 for (auto &tensor : ms_inputs_for_api_) {
266 if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
267 MSTensor *input = MSTensor::StringsToTensor(tensor.Name(), {"you're the best."});
268 if (input == nullptr) {
269 std::cerr << "StringsToTensor failed" << std::endl;
270 MS_LOG(ERROR) << "StringsToTensor failed";
271 return RET_ERROR;
272 }
273 tensor = *input;
274 delete input;
275 } else {
276 auto input_data = tensor.MutableData();
277 if (input_data == nullptr) {
278 MS_LOG(ERROR) << "MallocData for inTensor failed";
279 return RET_ERROR;
280 }
281 int status = GenerateRandomData(tensor.DataSize(), input_data, static_cast<int>(tensor.DataType()));
282 if (status != RET_OK) {
283 std::cerr << "GenerateRandomData for inTensor failed: " << status << std::endl;
284 MS_LOG(ERROR) << "GenerateRandomData for inTensor failed:" << status;
285 return status;
286 }
287 }
288 }
289 return RET_OK;
290 }
291
292 void BenchmarkUnifiedApi::UpdateConfigInfo() {
293 #define WIPE_DEEP_CONFIG_ENV '0'
294 #define WIPE_DEEP_CONFIG_VOCAB_SIZE "100"
295 #define WIPE_DEEP_CONFIG_DEVICE_CACHE_SIZE "40"
296
297 auto env = std::getenv("BENCHMARK_UPDATE_CONFIG_ENV");
298 if (env == nullptr) {
299 return;
300 }
301 if (env[0] == WIPE_DEEP_CONFIG_ENV) {
302 ms_model_.UpdateConfig(kMSCacheSection, std::make_pair(kMSCacheVocabSizeKey, WIPE_DEEP_CONFIG_VOCAB_SIZE));
303 ms_model_.UpdateConfig(kMSCacheSection, std::make_pair(kMSCacheDeviceSizeKey, WIPE_DEEP_CONFIG_DEVICE_CACHE_SIZE));
304 }
305 return;
306 }
307
308 int BenchmarkUnifiedApi::ReadInputFile() {
309 #ifdef PARALLEL_INFERENCE
310 if (flags_->enable_parallel_predict_) {
311 std::vector<void *> inputs;
312 for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
313 size_t size;
314 char *bin_buf = ReadFile(flags_->input_data_list_[i].c_str(), &size);
315 if (bin_buf == nullptr) {
316 MS_LOG(ERROR) << "ReadFile return nullptr";
317 for (auto &data : inputs) {
318 auto buf = static_cast<char *>(data);
319 delete[] buf;
320 data = nullptr;
321 }
322 return RET_ERROR;
323 }
324 inputs.push_back(bin_buf);
325 }
326 all_inputs_data_.push_back(inputs);
327 return RET_OK;
328 }
329 #endif
330 if (ms_inputs_for_api_.empty()) {
331 return RET_OK;
332 }
333
334 if (this->flags_->in_data_type_ == kImage) {
335 MS_LOG(ERROR) << "Not supported image input";
336 return RET_ERROR;
337 } else {
338 for (size_t i = 0; i < flags_->input_data_list_.size(); i++) {
339 auto &cur_tensor = ms_inputs_for_api_.at(i);
340 size_t size;
341 char *bin_buf = ReadFile(flags_->input_data_list_[i].c_str(), &size);
342 if (bin_buf == nullptr) {
343 MS_LOG(ERROR) << "ReadFile return nullptr";
344 return RET_ERROR;
345 }
346 if (static_cast<int>(cur_tensor.DataType()) == kObjectTypeString) {
347 std::string str(bin_buf, size);
348 MSTensor *input = MSTensor::StringsToTensor(cur_tensor.Name(), {str});
349 if (input == nullptr) {
350 std::cerr << "StringsToTensor failed" << std::endl;
351 MS_LOG(ERROR) << "StringsToTensor failed";
352 delete[] bin_buf;
353 return RET_ERROR;
354 }
355 cur_tensor = *input;
356 } else {
357 auto tensor_data_size = cur_tensor.DataSize();
358 if (size != tensor_data_size) {
359 std::cerr << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size
360 << std::endl;
361 MS_LOG(ERROR) << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size;
362 delete[] bin_buf;
363 return RET_ERROR;
364 }
365 auto input_data = cur_tensor.MutableData();
366 if (input_data == nullptr) {
367 MS_LOG(ERROR) << "input_data is nullptr.";
368 delete[] bin_buf;
369 return RET_ERROR;
370 }
371 memcpy(input_data, bin_buf, tensor_data_size);
372 }
373 delete[] bin_buf;
374 }
375 }
376 return RET_OK;
377 }
378
379 int BenchmarkUnifiedApi::GetDataTypeByTensorName(const std::string &tensor_name) {
380 #ifdef PARALLEL_INFERENCE
381 for (auto tensor : ms_outputs_for_api_) {
382 auto name = tensor.Name();
383 if (name == tensor_name) {
384 return static_cast<int>(tensor.DataType());
385 }
386 }
387 MS_LOG(ERROR) << "cannot find tensor name: " << tensor_name << " in model output.";
388 return static_cast<int>(DataType::kTypeUnknown);
389 #endif
390 return static_cast<int>(ms_model_.GetOutputByTensorName(tensor_name).DataType());
391 }
392
393 void BenchmarkUnifiedApi::UpdateDistributionName(const std::shared_ptr<mindspore::Context> &context,
394 std::string *name) {
395 if (flags_->device_ != "GPU") {
396 return;
397 }
398
399 if (name->size() == 0) {
400 return;
401 }
402
403 if (context->MutableDeviceInfo().size() == 0) {
404 return;
405 }
406
407 auto device_info = context->MutableDeviceInfo().front();
408 GPUDeviceInfo *gpu_info = reinterpret_cast<GPUDeviceInfo *>(device_info.get());
409 auto rank_id = gpu_info->GetRankID();
410 if (rank_id == 0) {
411 return;
412 }
413 gpu_info->SetDeviceID(rank_id);
414
415 /* model file & benchmark data file: include .mindir
416 config file : include .config */
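// e.g. with rank_id 1, "net.mindir" becomes "net1.mindir", so every rank picks up its own
// model, benchmark data, or config file.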
417 auto replace_pos = name->find(".mindir");
418 if (replace_pos == std::string::npos) {
419 replace_pos = name->find(".config");
420 }
421
422 if (replace_pos == std::string::npos) {
423 return;
424 }
425
426 *name = name->replace(replace_pos, sizeof('.'), std::to_string(rank_id) + ".");
427
428 MS_LOG(INFO) << "Update distribution info: " << *name;
429 std::cout << "Update distribution info: " << *name << std::endl;
430 return;
431 }
432
433 void BenchmarkUnifiedApi::InitMSContextForGPU(const std::shared_ptr<mindspore::Context> &context,
434 std::vector<std::shared_ptr<DeviceInfoContext>> *device_list) {
435 std::shared_ptr<GPUDeviceInfo> gpu_device_info = std::make_shared<GPUDeviceInfo>();
436 gpu_device_info->SetEnableFP16(flags_->enable_fp16_);
437 uint32_t device_id = 0;
438 auto device_id_env = std::getenv("GPU_DEVICE_ID");
439 if (device_id_env != nullptr) {
440 // try {
441 device_id = static_cast<uint32_t>(std::stoul(device_id_env));
442 // } catch (std::invalid_argument &e) {
443 // MS_LOG(WARNING) << "Invalid device id env:" << device_id_env << ". Set default device id 0.";
444 // }
445 MS_LOG(INFO) << "GPU device_id = " << device_id;
446 }
447 gpu_device_info->SetDeviceID(device_id);
448 if (flags_->device_id_ >= 0) {
449 gpu_device_info->SetDeviceID(flags_->device_id_);
450 MS_LOG(INFO) << "GPU device_id = " << flags_->device_id_;
451 }
452 if (flags_->enable_gl_texture_) {
453 gpu_device_info->SetEnableGLTexture(flags_->enable_gl_texture_);
454
455 auto gl_context = eglGetCurrentContext();
456 gpu_device_info->SetGLContext(gl_context);
457
458 auto gl_display = eglGetCurrentDisplay();
459 gpu_device_info->SetGLDisplay(gl_display);
460 } else {
461 gpu_device_info->SetProvider("tensorrt");
462 gpu_device_info->SetAllocator(nullptr);
463 }
464 device_list->push_back(gpu_device_info);
465 }
466
467 void BenchmarkUnifiedApi::InitMSContextForAscend(const std::shared_ptr<mindspore::Context> &context,
468 std::vector<std::shared_ptr<DeviceInfoContext>> *device_list) {
469 uint32_t device_id = 0;
470 auto device_id_env = std::getenv("ASCEND_DEVICE_ID");
471 if (device_id_env != nullptr) {
472 // try {
473 device_id = static_cast<uint32_t>(std::stoul(device_id_env));
474 // } catch (std::invalid_argument &e) {
475 // MS_LOG(WARNING) << "Invalid device id env:" << device_id_env << ". Set default device id 0.";
476 // }
477 MS_LOG(INFO) << "Ascend device_id = " << device_id;
478 }
479 std::shared_ptr<AscendDeviceInfo> ascend_device_info = std::make_shared<AscendDeviceInfo>();
480 ascend_device_info->SetDeviceID(device_id);
481 ascend_device_info->SetProvider(flags_->provider_);
482 auto back_policy_env = std::getenv("ASCEND_BACK_POLICY");
483 if (back_policy_env != nullptr) {
484 ascend_device_info->SetProvider(back_policy_env);
485 }
486 #ifdef ENABLE_CLOUD_FUSION_INFERENCE
487 if (flags_->device_id_ >= 0 && flags_->rank_id_ >= 0) {
488 ascend_device_info->SetDeviceID(flags_->device_id_);
489 ascend_device_info->SetRankID(flags_->rank_id_);
490 ascend_device_info->SetProvider("ge");
491 }
492 #endif
493 device_list->push_back(ascend_device_info);
494 }
495
496 int BenchmarkUnifiedApi::InitMSContext(const std::shared_ptr<mindspore::Context> &context) {
497 context->SetThreadNum(flags_->num_threads_);
498 context->SetGroupInfoFile(flags_->group_info_file_);
499 context->SetThreadAffinity(flags_->cpu_bind_mode_);
500 context->SetInterOpParallelNum(flags_->inter_op_parallel_num_);
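// An explicit core list overrides the bind-mode thread affinity set above.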
501 if (!flags_->core_list_.empty()) {
502 context->SetThreadAffinity(flags_->core_list_);
503 }
504 #ifndef ENABLE_CLOUD_FUSION_INFERENCE
505 if (flags_->delegate_mode_ == "CoreML") {
506 context->SetBuiltInDelegate(kCoreML);
507 } else if (flags_->delegate_mode_ == "NNAPI") {
508 context->SetBuiltInDelegate(kNNAPI);
509 }
510 context->SetEnableParallel(flags_->enable_parallel_);
511 #endif
512
513 auto &device_list = context->MutableDeviceInfo();
514 if (flags_->device_ == "GPU" || flags_->device_ == "Auto") {
515 InitMSContextForGPU(context, &device_list);
516 }
517
518 if (flags_->device_ == "NPU" || flags_->device_ == "Auto") {
519 std::shared_ptr<KirinNPUDeviceInfo> npu_device_info = std::make_shared<KirinNPUDeviceInfo>();
520 npu_device_info->SetEnableFP16(flags_->enable_fp16_);
521 npu_device_info->SetFrequency(kFrequencyDefault);
522 device_list.push_back(npu_device_info);
523 }
524
525 if (flags_->device_ == "Ascend" || flags_->device_ == "Auto") {
526 MS_LOG(ERROR) << "OHOS does not support Ascend devices.";
527 return RET_NOT_SUPPORT;
528 }
529
530 if (flags_->device_ == "NNRT" || flags_->device_ == "Auto") {
531 std::shared_ptr<NNRTDeviceInfo> nnrt_device_info = std::make_shared<NNRTDeviceInfo>();
532 size_t num = 0;
533 auto descs = OH_AI_GetAllNNRTDeviceDescs(&num);
534 NNRTDeviceDesc *desc_nnrt = nullptr;
535 for (size_t i = 0; i < num; i++) {
536 auto desc = OH_AI_GetElementOfNNRTDeviceDescs(descs, i);
537 auto name = OH_AI_GetNameFromNNRTDeviceDesc(desc);
538 if (strncmp(name, "NPU_", 4) == 0) {  // NPU inference with online compilation
539 desc_nnrt = desc;
540 break;
541 }
542 }
543 if (desc_nnrt == nullptr) {
544 BENCHMARK_LOG_ERROR("nnrt desc get failed");
545 return RET_ERROR;
546 }
547 auto id = OH_AI_GetDeviceIdFromNNRTDeviceDesc(desc_nnrt);
548 nnrt_device_info->SetDeviceID(id);
549 OH_AI_DestroyAllNNRTDeviceDescs(&descs);
550 device_list.push_back(nnrt_device_info);
551 }
552
553 // CPU priority is behind GPU and NPU
554 std::shared_ptr<CPUDeviceInfo> device_info = std::make_shared<CPUDeviceInfo>();
555 device_info->SetEnableFP16(flags_->enable_fp16_);
556 device_info->SetProvider(flags_->provider_);
557 device_list.push_back(device_info);
558
559 return RET_OK;
560 }
561 #ifdef PARALLEL_INFERENCE
562 int BenchmarkUnifiedApi::CompareOutputForModelPool(std::vector<mindspore::MSTensor> *outputs) {
563 if (outputs->empty()) {
564 MS_LOG(ERROR) << "outputs is empty.";
565 return RET_ERROR;
566 }
567 std::cout << "================ Comparing Output data ================" << std::endl;
568 float total_bias = 0;
569 int total_size = 0;
570 // check the output tensor name.
571 for (size_t i = 0; i < outputs->size(); i++) {
572 std::string tensor_name = outputs->at(i).Name();
573 mindspore::MSTensor tensor = outputs->at(i);
574 if (tensor == nullptr) {
575 MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
576 return RET_ERROR;
577 }
578 constexpr float kParallelRelative = 1e-7;
579 constexpr float kParallelAbsolute = 1e-10;
580 int ret = CompareDataGetTotalBiasAndSize(tensor_name, &tensor, &total_bias, &total_size, kParallelRelative,
581 kParallelAbsolute);
582 if (ret != RET_OK) {
583 MS_LOG(ERROR) << "Error in CompareData";
584 std::cerr << "Error in CompareData" << std::endl;
585 std::cout << "=======================================================" << std::endl << std::endl;
586 return ret;
587 }
588 }
589 float mean_bias;
590 if (total_size != 0) {
591 mean_bias = ((total_bias / float_t(total_size)) * kPercentageDivisor);
592 } else {
593 mean_bias = 0;
594 }
595
596 std::cout << "Mean bias of all nodes/tensors: " << mean_bias << "%" << std::endl;
597 std::cout << "=======================================================" << std::endl << std::endl;
598
599 if (mean_bias > this->flags_->accuracy_threshold_) {
600 MS_LOG(ERROR) << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%";
601 std::cerr << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%" << std::endl;
602 return RET_ERROR;
603 }
604 return RET_OK;
605 }
606 #endif
607
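// Expands an IEEE-754 half-precision value to single precision: the exponent/mantissa bits are shifted
// into place, the exponent is re-biased (+112 << 23 == 0x38000000), zeros and subnormals (all exponent
// bits zero) are flushed to zero, and the sign bit is restored. Inf/NaN inputs are not treated specially.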
608 void Convert2Float32(float *__restrict out, const uint16_t in) {
609 uint32_t t1;
610 uint32_t t2;
611 uint32_t t3;
612
613 t1 = in & 0x7fffu;
614 t2 = in & 0x8000u;
615 t3 = in & 0x7c00u;
616
617 t1 <<= 13u;
618 t2 <<= 16u;
619
620 t1 += 0x38000000;
621
622 t1 = (t3 == 0 ? 0 : t1);
623
624 t1 |= t2;
625
626 *(reinterpret_cast<uint32_t *>(out)) = t1;  // write the assembled FP32 bit pattern; a numeric cast would yield the wrong value
627 }
628
629 namespace {
630 template <typename T>
631 bool VectorValueCompare(const std::vector<T> &vec1, const std::vector<T> &vec2) {
632 if (vec1.size() != vec2.size()) {
633 return false;
634 }
635 for (auto &ele : vec1) {
636 if (!IsContain(vec2, ele)) {
637 return false;
638 }
639 }
640 return true;
641 }
642 } // namespace
643
644 int BenchmarkUnifiedApi::CompareOutput() {
645 std::cout << "================ Comparing Output data ================" << std::endl;
646 float total_bias = 0;
647 int total_size = 0;
648 // check the output tensor name.
649 if (!VectorValueCompare(this->benchmark_tensor_names_, ms_model_.GetOutputTensorNames())) {
650 MS_LOG(ERROR) << "The output tensor name is wrong.";
651 return RET_ERROR;
652 }
653 for (const auto &calib_tensor : benchmark_data_) {
654 std::string tensor_name = calib_tensor.first;
655 mindspore::MSTensor tensor = ms_model_.GetOutputByTensorName(tensor_name);
656 if (tensor == nullptr) {
657 MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
658 return RET_ERROR;
659 }
660 int ret;
661 if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
662 std::vector<std::string> output_strings = MSTensor::TensorToStrings(tensor);
663 ret = CompareStringData(tensor_name, calib_tensor.second->strings_data, output_strings);
664 } else {
665 if (flags_->enable_gl_texture_) {
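// GPU texture outputs live in device memory: copy them back to the host and wrap them in a
// temporary FP32 tensor before comparing against the calibration data.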
666 auto *gltexture_id = reinterpret_cast<GLuint *>(tensor.MutableData());
667 if (gltexture_id == nullptr) {
668 MS_LOG(ERROR) << "get gltexture_id failed";
669 return RET_ERROR;
670 }
671 auto tmp = gl_runtime_.CopyDeviceTextureToHost(*gltexture_id);
672 if (tmp == nullptr) {
673 MS_LOG(ERROR) << "CopyDeviceTextureToHost failed";
674 return RET_ERROR;
675 }
676 float *hostptr = reinterpret_cast<float *>(tmp);
677
678 auto tensor_shape = tensor.Shape();
679 auto data_len =
680 std::accumulate(tensor_shape.begin(), tensor_shape.end(), sizeof(float), std::multiplies<size_t>());
681 auto *new_tensor = new (std::nothrow)
682 MSTensor(tensor_name, mindspore::DataType::kNumberTypeFloat32, tensor_shape, hostptr, data_len);
683 MS_CHECK_TRUE_MSG(new_tensor != nullptr, RET_ERROR, "new tensor failed");
684 if (new_tensor->MutableData() == nullptr) {
685 MS_LOG(ERROR) << "CopyDeviceTextureToHost failed";
686 delete new_tensor;
687 return RET_ERROR;
688 }
689 ret = CompareDataGetTotalBiasAndSize(tensor_name, new_tensor, &total_bias, &total_size);
690 delete new_tensor;
691 } else {
692 ret = CompareDataGetTotalBiasAndSize(tensor_name, &tensor, &total_bias, &total_size);
693 }
694 }
695 if (ret != RET_OK) {
696 MS_LOG(ERROR) << "Error in CompareData";
697 std::cerr << "Error in CompareData" << std::endl;
698 std::cout << "=======================================================" << std::endl << std::endl;
699 return ret;
700 }
701 }
702 float mean_bias;
703 if (total_size != 0) {
704 mean_bias = ((total_bias / float_t(total_size)) * kPercentageDivisor);
705 } else {
706 mean_bias = 0;
707 }
708
709 std::cout << "Mean bias of all nodes/tensors: " << mean_bias << "%" << std::endl;
710 std::cout << "=======================================================" << std::endl << std::endl;
711
712 if (mean_bias > this->flags_->accuracy_threshold_) {
713 MS_LOG(ERROR) << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%";
714 std::cerr << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%" << std::endl;
715 return RET_ERROR;
716 }
717 return RET_OK;
718 }
719
720 int BenchmarkUnifiedApi::CompareOutputByCosineDistance(float cosine_distance_threshold) {
721 std::cout << "================ Comparing Output data ================" << std::endl;
722 float total_cosine_distance = 0;
723 int total_size = 0;
724 // check the output tensor name.
725 if (this->benchmark_tensor_names_ != ms_model_.GetOutputTensorNames()) {
726 MS_LOG(ERROR) << "The output tensor name is wrong.";
727 return RET_ERROR;
728 }
729 for (const auto &calib_tensor : benchmark_data_) {
730 std::string tensor_name = calib_tensor.first;
731 mindspore::MSTensor tensor = ms_model_.GetOutputByTensorName(tensor_name);
732 if (tensor == nullptr) {
733 MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
734 return RET_ERROR;
735 }
736 int ret;
737 if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
738 std::vector<std::string> output_strings = MSTensor::TensorToStrings(tensor);
739 ret = CompareStringData(tensor_name, calib_tensor.second->strings_data, output_strings);
740 } else {
741 ret = CompareDataGetTotalCosineDistanceAndSize(tensor_name, &tensor, &total_cosine_distance, &total_size);
742 }
743 if (ret != RET_OK) {
744 MS_LOG(ERROR) << "Error in CompareData";
745 std::cerr << "Error in CompareData" << std::endl;
746 std::cout << "=======================================================" << std::endl << std::endl;
747 return ret;
748 }
749 }
750 float mean_cosine_distance;
751 if (total_size != 0) {
752 mean_cosine_distance = total_cosine_distance / float_t(total_size);
753 } else {
754 mean_cosine_distance = CosineErrMaxVal;
755 }
756 mean_cosine_distance = 1 - mean_cosine_distance;
757 std::cout << "Cosine distance of all nodes/tensors: " << std::setprecision(std::numeric_limits<double>::digits10)
758 << mean_cosine_distance << std::endl;
759 std::cout << "=======================================================" << std::endl << std::endl;
760
761 if (mean_cosine_distance < cosine_distance_threshold) {
762 MS_LOG(ERROR) << "cosine distance of all nodes/tensors is too small: " << mean_cosine_distance;
763 std::cerr << "Mean cosine distance of all nodes/tensors is too small: " << mean_cosine_distance << std::endl;
764 return RET_ERROR;
765 }
766 return RET_OK;
767 }
768
769 int BenchmarkUnifiedApi::CompareDataGetTotalBiasAndSize(const std::string &name, mindspore::MSTensor *tensor,
770 float *total_bias, int *total_size, float relative_tolerance,
771 float absolute_tolerance) {
772 float bias = 0;
773 auto mutableData = tensor->MutableData();
774 if (mutableData == nullptr) {
775 MS_LOG(ERROR) << "mutableData is nullptr.";
776 return RET_ERROR;
777 }
778 switch (static_cast<int>(tensor->DataType())) {
779 case TypeId::kNumberTypeFloat:
780 case TypeId::kNumberTypeFloat32: {
781 bias = CompareData<float, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
782 break;
783 }
784 case TypeId::kNumberTypeInt8: {
785 bias = CompareData<int8_t, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
786 break;
787 }
788 case TypeId::kNumberTypeUInt8: {
789 bias = CompareData<uint8_t, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
790 break;
791 }
792 case TypeId::kNumberTypeInt32: {
793 bias = CompareData<int32_t, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
794 break;
795 }
796 case TypeId::kNumberTypeInt16: {
797 bias = CompareData<int16_t, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
798 break;
799 }
800 case TypeId::kNumberTypeBool: {
801 bias = CompareData<bool, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
802 break;
803 }
804 case TypeId::kNumberTypeFloat16: {
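// FP16 outputs are widened to FP32 element by element before comparison with the FP32 calibration data.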
805 size_t shapeSize = 1;
806 for (int64_t dim : tensor->Shape()) {
807 if (dim <= 0) {
808 MS_LOG(ERROR) << "The shape of output " << name << " should be great than 0 after inference, got "
809 << tensor->Shape();
810 return RET_ERROR;
811 }
812 MS_CHECK_FALSE_MSG(SIZE_MUL_OVERFLOW(shapeSize, static_cast<size_t>(dim)), RET_ERROR, "mul overflow");
813 shapeSize *= static_cast<size_t>(dim);
814 }
815 auto *floatArr = new float[shapeSize];
816 for (size_t i = 0; i < shapeSize; ++i) {
817 uint16_t tmpInt = reinterpret_cast<uint16_t *>(mutableData)[i];
818 floatArr[i] = ShortToFloat32(tmpInt);
819 }
820 bias = CompareData<float, int64_t>(name, tensor->Shape(), floatArr);
821 delete[] floatArr;
822 break;
823 }
824 default:
825 MS_LOG(ERROR) << "Datatype " << static_cast<int>(tensor->DataType()) << " is not supported.";
826 return RET_ERROR;
827 }
828 if (bias < 0) {
829 MS_LOG(ERROR) << "CompareData failed, name: " << name;
830 return RET_ERROR;
831 }
832 *total_bias += bias;
833 *total_size += 1;
834 return RET_OK;
835 }
836 int BenchmarkUnifiedApi::CompareDataGetTotalCosineDistanceAndSize(const std::string &name, mindspore::MSTensor *tensor,
837 float *total_cosine_distance, int *total_size) {
838 if (tensor == nullptr) {
839 MS_LOG(ERROR) << "tensor is nullptr.";
840 return RET_ERROR;
841 }
842 if (total_cosine_distance == nullptr) {
843 MS_LOG(ERROR) << "total_cosine_distance is nullptr.";
844 return RET_ERROR;
845 }
846 if (total_size == nullptr) {
847 MS_LOG(ERROR) << "total_size is nullptr.";
848 return RET_ERROR;
849 }
850 float bias = 0;
851 auto mutableData = tensor->MutableData();
852 if (mutableData == nullptr) {
853 MS_LOG(ERROR) << "mutableData is nullptr.";
854 return RET_ERROR;
855 }
856 int res = RET_OK;
857 switch (static_cast<int>(tensor->DataType())) {
858 case TypeId::kNumberTypeFloat:
859 case TypeId::kNumberTypeFloat32: {
860 res = CompareDatabyCosineDistance<float>(name, tensor->Shape(), mutableData, &bias);
861 break;
862 }
863 case TypeId::kNumberTypeFloat16: {
864 size_t shapeSize = 1;
865 for (int64_t dim : tensor->Shape()) {
866 if (dim <= 0) {
867 MS_LOG(ERROR) << "Invalid shape.";
868 return RET_ERROR;
869 }
870 MS_CHECK_FALSE_MSG(SIZE_MUL_OVERFLOW(shapeSize, static_cast<size_t>(dim)), RET_ERROR, "mul overflow");
871 shapeSize *= static_cast<size_t>(dim);
872 }
873 float *floatArr = new float[shapeSize];
874 for (size_t i = 0; i < shapeSize; ++i) {
875 uint16_t tmpInt = reinterpret_cast<uint16_t *>(mutableData)[i];
876 Convert2Float32(&floatArr[i], tmpInt);
877 }
878 // Compare the widened FP32 copy; writing FP32 values back into the FP16 buffer would overrun it.
879 res = CompareDatabyCosineDistance<float>(name, tensor->Shape(), floatArr, &bias);
880 delete[] floatArr;
881 break;
882 }
883 case TypeId::kNumberTypeInt8: {
884 res = CompareDatabyCosineDistance<int8_t>(name, tensor->Shape(), mutableData, &bias);
885 break;
886 }
887 case TypeId::kNumberTypeUInt8: {
888 res = CompareDatabyCosineDistance<uint8_t>(name, tensor->Shape(), mutableData, &bias);
889 break;
890 }
891 case TypeId::kNumberTypeInt32: {
892 res = CompareDatabyCosineDistance<int32_t>(name, tensor->Shape(), mutableData, &bias);
893 break;
894 }
895 case TypeId::kNumberTypeInt16: {
896 res = CompareDatabyCosineDistance<int16_t>(name, tensor->Shape(), mutableData, &bias);
897 break;
898 }
899 case TypeId::kNumberTypeBool: {
900 res = CompareDatabyCosineDistance<bool>(name, tensor->Shape(), mutableData, &bias);
901 break;
902 }
903 default:
904 MS_LOG(ERROR) << "Datatype " << static_cast<int>(tensor->DataType()) << " is not supported.";
905 return RET_ERROR;
906 }
907 if (res != RET_OK) {
908 MS_LOG(ERROR) << "CompareData failed, name: " << name;
909 return RET_ERROR;
910 }
911 *total_cosine_distance += 1 - bias;
912 *total_size += 1;
913 return RET_OK;
914 }
915
916 int BenchmarkUnifiedApi::MarkPerformance() {
917 MS_LOG(INFO) << "Running warm up loops...";
918 std::cout << "Running warm up loops..." << std::endl;
919 std::vector<MSTensor> outputs;
920 for (int i = 0; i < flags_->warm_up_loop_count_; i++) {
921 auto status = ms_model_.Predict(ms_inputs_for_api_, &outputs);
922 if (status != kSuccess) {
923 MS_LOG(ERROR) << "Inference error ";
924 std::cerr << "Inference error " << std::endl;
925 return RET_ERROR;
926 }
927 }
928
929 MS_LOG(INFO) << "Running benchmark loops...";
930 std::cout << "Running benchmark loops..." << std::endl;
931 uint64_t time_min = UINT64_MAX;
932 uint64_t time_max = 0;
933 uint64_t time_avg = 0;
934
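// Time each inference call individually; min/max/avg latency is reported in milliseconds after the loop.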
935 for (int i = 0; i < flags_->loop_count_; i++) {
936 auto inputs = ms_model_.GetInputs();
937 for (auto tensor : inputs) {
938 tensor.MutableData(); // prepare data
939 }
940 auto start = GetTimeUs();
941 auto status = ms_model_.Predict(ms_inputs_for_api_, &outputs, ms_before_call_back_, ms_after_call_back_);
942 if (status != kSuccess) {
943 MS_LOG(ERROR) << "Inference error ";
944 std::cerr << "Inference error ";
945 return RET_ERROR;
946 }
947
948 auto end = GetTimeUs();
949 auto time = end - start;
950 time_min = std::min(time_min, time);
951 time_max = std::max(time_max, time);
952 time_avg += time;
953 }
954
955 if (flags_->time_profiling_) {
956 const std::vector<std::string> per_op_name = {"opName", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
957 const std::vector<std::string> per_op_type = {"opType", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
958 (void)PrintResult(per_op_name, op_times_by_name_);
959 (void)PrintResult(per_op_type, op_times_by_type_);
960 #ifdef ENABLE_ARM64
961 } else if (flags_->perf_profiling_) {
962 if (flags_->perf_event_ == "CACHE") {
963 const std::vector<std::string> per_op_name = {"opName", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
964 const std::vector<std::string> per_op_type = {"opType", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
965 (void)PrintPerfResult(per_op_name, op_perf_by_name_);
966 (void)PrintPerfResult(per_op_type, op_perf_by_type_);
967 } else if (flags_->perf_event_ == "STALL") {
968 const std::vector<std::string> per_op_name = {"opName", "frontend(k)", "frontend(%)", "backend(k)",
969 "backend(%)"};
970 const std::vector<std::string> per_op_type = {"opType", "frontend(k)", "frontend(%)", "backend(k)",
971 "backend(%)"};
972 (void)PrintPerfResult(per_op_name, op_perf_by_name_);
973 (void)PrintPerfResult(per_op_type, op_perf_by_type_);
974 } else {
975 const std::vector<std::string> per_op_name = {"opName", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
976 const std::vector<std::string> per_op_type = {"opType", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
977 (void)PrintPerfResult(per_op_name, op_perf_by_name_);
978 (void)PrintPerfResult(per_op_type, op_perf_by_type_);
979 }
980 #endif
981 }
982
983 if (flags_->loop_count_ > 0) {
984 time_avg /= static_cast<size_t>(flags_->loop_count_);
985 MS_LOG(INFO) << "Model = " << flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
986 << ", NumThreads = " << flags_->num_threads_ << ", MinRunTime = " << time_min / kFloatMSEC
987 << ", MaxRuntime = " << time_max / kFloatMSEC << ", AvgRunTime = " << time_avg / kFloatMSEC;
988 printf("Model = %s, NumThreads = %d, MinRunTime = %f ms, MaxRuntime = %f ms, AvgRunTime = %f ms\n",
989 flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1).c_str(), flags_->num_threads_,
990 time_min / kFloatMSEC, time_max / kFloatMSEC, time_avg / kFloatMSEC);
991 }
992 return RET_OK;
993 }
994
995 int BenchmarkUnifiedApi::MarkAccuracy() {
996 MS_LOG(INFO) << "MarkAccuracy";
997 std::cout << "MarkAccuracy" << std::endl;
998
999 int status = 0;
1000 if (flags_->enable_gl_texture_) {
1001 for (auto in_tensor : ms_inputs_for_api_) {
1002 auto *input = reinterpret_cast<GLuint *>(in_tensor.MutableData());
1003 if (input == nullptr) {
1004 MS_LOG(ERROR) << "get input data failed";
1005 return RET_ERROR;
1006 }
1007 float *hostptr = reinterpret_cast<float *>(gl_runtime_.CopyDeviceTextureToHost(*input));
1008 size_t print_num = 20;
1009 gl_runtime_.PrintImage2DData(hostptr, 1, 1, print_num);
1010 }
1011 } else {
1012 status = PrintInputData();
1013 if (status != RET_OK) {
1014 MS_LOG(ERROR) << "PrintInputData error " << status;
1015 std::cerr << "PrintInputData error " << status << std::endl;
1016 return status;
1017 }
1018 }
1019 std::vector<MSTensor> outputs;
1020 auto ret = ms_model_.Predict(ms_inputs_for_api_, &outputs, ms_before_call_back_, ms_after_call_back_);
1021 if (ret != kSuccess) {
1022 MS_LOG(ERROR) << "Inference error ";
1023 std::cerr << "Inference error " << std::endl;
1024 return RET_ERROR;
1025 }
1026 status = ReadCalibData();
1027 if (status != RET_OK) {
1028 MS_LOG(ERROR) << "Read calib data error " << status;
1029 std::cerr << "Read calib data error " << status << std::endl;
1030 return status;
1031 }
1032 status = CompareOutput();
1033 if (status != RET_OK) {
1034 MS_LOG(ERROR) << "Compare output error " << status;
1035 std::cerr << "Compare output error " << status << std::endl;
1036 return status;
1037 }
1038 if (this->flags_->cosine_distance_threshold_ >= -1) {
1039 status = CompareOutputByCosineDistance(this->flags_->cosine_distance_threshold_);
1040 if (status != RET_OK) {
1041 MS_LOG(ERROR) << "Compare output error by cosine distance " << status;
1042 std::cerr << "Compare output error by cosine distance " << status << std::endl;
1043 return status;
1044 }
1045 }
1046 return RET_OK;
1047 }
1048
1049 int BenchmarkUnifiedApi::PrintInputData() {
1050 for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
1051 mindspore::MSTensor input = ms_inputs_for_api_[i];
1052 auto tensor_data_type = static_cast<int>(input.DataType());
1053
1054 std::cout << "InData " << i << ": ";
1055 if (tensor_data_type == TypeId::kNumberTypeFloat16) {
1056 MS_LOG(INFO) << "DataType: " << TypeId::kNumberTypeFloat16;
1057 continue;
1058 }
1059 if (tensor_data_type == TypeId::kObjectTypeString) {
1060 std::vector<std::string> output_strings = MSTensor::TensorToStrings(input);
1061 size_t print_num = std::min(output_strings.size(), static_cast<size_t>(20));
1062 for (size_t j = 0; j < print_num; j++) {
1063 std::cout << output_strings[j] << std::endl;
1064 }
1065 continue;
1066 }
1067 size_t print_num = std::min(static_cast<int>(input.ElementNum()), kPrintDataNum);
1068 const void *in_data = input.MutableData();
1069 if (in_data == nullptr) {
1070 MS_LOG(ERROR) << "in_data is nullptr.";
1071 return RET_ERROR;
1072 }
1073
1074 for (size_t j = 0; j < print_num; j++) {
1075 if (tensor_data_type == TypeId::kNumberTypeFloat32 || tensor_data_type == TypeId::kNumberTypeFloat) {
1076 std::cout << static_cast<const float *>(in_data)[j] << " ";
1077 } else if (tensor_data_type == TypeId::kNumberTypeInt8) {
1078 std::cout << static_cast<const int8_t *>(in_data)[j] << " ";
1079 } else if (tensor_data_type == TypeId::kNumberTypeUInt8) {
1080 std::cout << static_cast<const uint8_t *>(in_data)[j] << " ";
1081 } else if (tensor_data_type == TypeId::kNumberTypeInt32) {
1082 std::cout << static_cast<const int32_t *>(in_data)[j] << " ";
1083 } else if (tensor_data_type == TypeId::kNumberTypeInt64) {
1084 std::cout << static_cast<const int64_t *>(in_data)[j] << " ";
1085 } else if (tensor_data_type == TypeId::kNumberTypeBool) {
1086 std::cout << static_cast<const bool *>(in_data)[j] << " ";
1087 } else {
1088 MS_LOG(ERROR) << "Datatype: " << tensor_data_type << " is not supported.";
1089 return RET_ERROR;
1090 }
1091 }
1092 std::cout << std::endl;
1093 }
1094 return RET_OK;
1095 }
1096 #ifdef PARALLEL_INFERENCE
1097 void BenchmarkUnifiedApi::ModelParallelRunnerWarmUp(int index) {
1098 auto in = model_runner_.GetInputs();
1099 for (size_t i = 0; i < in.size(); i++) {
1100 in[i].SetShape(resize_dims_[i]);
1101 in[i].SetData(all_inputs_data_[index][i], false);
1102 }
1103 auto warm_up_start = GetTimeUs();
1104 std::vector<MSTensor> output;
1105 auto ret = model_runner_.Predict(in, &output);
1106 for (size_t j = 0; j < in.size(); j++) {
1107 in[j].SetData(nullptr);
1108 }
1109 if (ret != kSuccess) {
1110 model_parallel_runner_ret_failed_ = true;
1111 MS_LOG(ERROR) << "model pool predict failed.";
1112 return;
1113 }
1114 auto warm_up_end = GetTimeUs();
1115 std::cout << "warm up index: " << index << " | time: " << (warm_up_end - warm_up_start) / kFloatMSEC << " ms\n";
1116 }
1117
1118 void BenchmarkUnifiedApi::ModelParallelRunnerRun(int task_num, int parallel_idx) {
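// A task_num of -1 keeps every worker predicting until the process is stopped; each worker spins on
// runner_run_start_ so that all threads start issuing requests at the same time.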
1119 for (int i = 0; i < task_num || task_num == -1; i++) {
1120 while (!runner_run_start_) {
1121 continue;
1122 }
1123 int idx = parallel_idx + flags_->warm_up_loop_count_;
1124 auto in = model_runner_.GetInputs();
1125 if (idx >= static_cast<int>(all_inputs_data_.size())) {
1126 MS_LOG(ERROR) << "idx is too big: " << idx;
1127 return;
1128 }
1129 auto in_data = all_inputs_data_[idx];
1130 for (size_t tensor_index = 0; tensor_index < in.size(); tensor_index++) {
1131 in.at(tensor_index).SetShape(resize_dims_.at(tensor_index));
1132 in.at(tensor_index).SetData(all_inputs_data_.at(idx)[tensor_index], false);
1133 }
1134 auto predict_start = GetTimeUs();
1135 std::vector<MSTensor> output;
1136 auto ret = model_runner_.Predict(in, &output);
1137 if (ret != kSuccess) {
1138 model_parallel_runner_ret_failed_ = true;
1139 MS_LOG(ERROR) << "model pool predict failed.";
1140 for (auto &item : in) {
1141 item.SetData(nullptr);
1142 }
1143 return;
1144 }
1145 auto predict_end = GetTimeUs();
1146 std::cout << "parallel index: " << parallel_idx << " | task index: " << i
1147 << " | predict time: " << (predict_end - predict_start) / kFloatMSEC << " ms\n";
1148 for (size_t j = 0; j < in.size(); j++) {
1149 in[j].SetData(nullptr);
1150 }
1151 if (!flags_->benchmark_data_file_.empty()) {
1152 auto status = CompareOutputForModelPool(&output);
1153 if (status != RET_OK) {
1154 model_parallel_runner_ret_failed_ = true;
1155 MS_LOG(ERROR) << "Compare output error " << status;
1156 return;
1157 }
1158 }
1159 }
1160 }
1161
1162 int BenchmarkUnifiedApi::AddConfigInfo(const std::shared_ptr<RunnerConfig> &runner_config) {
1163 if (!flags_->config_file_.empty()) {
1164 runner_config->SetConfigPath(flags_->config_file_);
1165 }
1166 std::map<std::string, std::string> config;
1167 if (flags_->enable_shared_thread_pool_) {
1168 config[kEnableSharedThreadPoolKey] = "true";
1169 if (!flags_->thread_num_limit_per_worker_.empty()) {
1170 config[kThreadNumLimitPerWorkerKey] = flags_->thread_num_limit_per_worker_;
1171 }
1172 if (!flags_->thread_num_remaining_per_worker_.empty()) {
1173 config[kThreadNumRemainingPerWorkerKey] = flags_->thread_num_remaining_per_worker_;
1174 }
1175 } else {
1176 config[kEnableSharedThreadPoolKey] = "false";
1177 }
1178 runner_config->SetConfigInfo(kSharedThreadPoolSection, config);
1179 return RET_OK;
1180 }
1181
1182 int BenchmarkUnifiedApi::ParallelInference(std::shared_ptr<mindspore::Context> context) {
1183 if (flags_->warm_up_loop_count_ > kMaxRequestNum || flags_->parallel_num_ > kMaxRequestNum) {
1184 MS_LOG(WARNING) << "in parallel predict, warm up loop count and parallel num should be less than " << kMaxRequestNum;
1185 }
1186
1187 // model runner init
1188 auto runner_config = std::make_shared<RunnerConfig>();
1189 runner_config->SetContext(context);
1190 runner_config->SetWorkersNum(flags_->workers_num_);
1191 auto status = AddConfigInfo(runner_config);
1192 MS_CHECK_FALSE_MSG(status != kSuccess, RET_ERROR, "add config info for parallel predict failed.");
1193 auto model_init_start = GetTimeUs();
1194 auto ret = model_runner_.Init(flags_->model_file_, runner_config);
1195 MS_CHECK_FALSE_MSG(ret != kSuccess, RET_ERROR, "model pool init failed.");
1196 auto model_init_end = GetTimeUs();
1197
1198 // load data
1199 ms_inputs_for_api_ = model_runner_.GetInputs();
1200 MS_CHECK_FALSE_MSG(ms_inputs_for_api_.empty(), RET_ERROR, "model pool input is empty.");
1201 ms_outputs_for_api_ = model_runner_.GetOutputs();
1202 MS_CHECK_FALSE_MSG(ms_outputs_for_api_.empty(), RET_ERROR, "model pool output is empty.");
1203
1204 if (!flags_->graph_input_shape_map_.empty()) {
1205 // parse model input shapes from --inputShape flag
1206 std::vector<std::vector<int64_t>> resize_dims = ParseGraphInputShapeMap(model_runner_.GetInputs());
1207 MS_CHECK_FALSE_MSG(resize_dims.empty(), RET_ERROR, "resize dims empty.");
1208 (void)std::transform(resize_dims.begin(), resize_dims.end(), std::back_inserter(resize_dims_),
1209 [&](const auto &shapes) { return shapes; });
1210 } else {
1211 (void)std::transform(flags_->resize_dims_.begin(), flags_->resize_dims_.end(), std::back_inserter(resize_dims_),
1212 [&](auto &shapes) { return this->ConverterToInt64Vector<int>(shapes); });
1213 }
1214
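// Prepare an independent set of inputs and outputs for every warm-up run and every parallel worker
// so that concurrent requests do not share buffers.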
1215 for (int i = 0; i < flags_->parallel_num_ + flags_->warm_up_loop_count_; i++) {
1216 status = LoadInput();
1217 MS_CHECK_FALSE_MSG(status != RET_OK, status, "Generate input data error");
1218 std::vector<MSTensor> output;
1219 all_outputs_.push_back(output);
1220 }
1221 if (!flags_->benchmark_data_file_.empty()) {
1222 for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
1223 auto &tensor = ms_inputs_for_api_[i];
1224 tensor.SetShape(resize_dims_[i]);
1225 tensor.SetData(all_inputs_data_[0][i], false);
1226 }
1227 status = PrintInputData();
1228 MS_CHECK_FALSE_MSG(status != RET_OK, status, "PrintInputData error ");
1229 status = ReadCalibData();
1230 MS_CHECK_FALSE_MSG(status != RET_OK, status, "ReadCalibData error ");
1231 }
1232
1233 // warm up
1234 std::vector<std::thread> model_thread_warm_up;
1235 for (int i = 0; i < flags_->warm_up_loop_count_; i++) {
1236 model_thread_warm_up.push_back(std::thread(&BenchmarkUnifiedApi::ModelParallelRunnerWarmUp, this, i));
1237 }
1238 for (auto &warm_up_thread : model_thread_warm_up) {
1239 warm_up_thread.join();
1240 }
1241 if (model_parallel_runner_ret_failed_) {
1242 return RET_ERROR;
1243 }
1244 std::cout << "=============== end warm up ===============\n";
1245 // do loop count
1246 std::vector<std::thread> model_thread_run;
1247 for (int parallel_num_idx = 0; parallel_num_idx < flags_->parallel_num_; parallel_num_idx++) {
1248 model_thread_run.push_back(
1249 std::thread(&BenchmarkUnifiedApi::ModelParallelRunnerRun, this, flags_->parallel_task_num_, parallel_num_idx));
1250 }
1251 auto start_run_time = lite::GetTimeUs();
1252 runner_run_start_ = true;
1253 for (auto &run_thread : model_thread_run) {
1254 run_thread.join();
1255 }
1256 auto end_run_time = lite::GetTimeUs();
1257 if (model_parallel_runner_ret_failed_) {
1258 return RET_ERROR;
1259 }
1260 std::cout << "=================================" << std::endl;
1261 std::cout << "parallel predict init time: " << (model_init_end - model_init_start) / kFloatMSEC << " ms\n";
1262 std::cout << "parallel predict all run time: " << (end_run_time - start_run_time) / kFloatMSEC << " ms\n";
1263 std::cout << "=================================" << std::endl;
1264 return RET_OK;
1265 }
1266 #endif
1267
1268 int BenchmarkUnifiedApi::PrintOutputData() {
1269 for (size_t i = 0; i < ms_outputs_for_api_.size(); i++) {
1270 mindspore::MSTensor input = ms_outputs_for_api_[i];
1271 auto tensor_data_type = static_cast<int>(input.DataType());
1272
1273 std::cout << "OutData " << i << ": ";
1274 if (tensor_data_type == TypeId::kNumberTypeFloat16) {
1275 MS_LOG(INFO) << "DataType: " << TypeId::kNumberTypeFloat16;
1276 continue;
1277 }
1278 if (tensor_data_type == TypeId::kObjectTypeString) {
1279 std::vector<std::string> output_strings = MSTensor::TensorToStrings(input);
1280 size_t print_num = std::min(output_strings.size(), static_cast<size_t>(20));
1281 for (size_t j = 0; j < print_num; j++) {
1282 std::cout << output_strings[j] << std::endl;
1283 }
1284 continue;
1285 }
1286 size_t print_num = std::min(static_cast<int>(input.ElementNum()), kPrintDataNum);
1287 const void *in_data = input.MutableData();
1288 if (in_data == nullptr) {
1289 MS_LOG(ERROR) << "out_data is nullptr.";
1290 return RET_ERROR;
1291 }
1292
1293 for (size_t j = 0; j < print_num; j++) {
1294 if (tensor_data_type == TypeId::kNumberTypeFloat32 || tensor_data_type == TypeId::kNumberTypeFloat) {
1295 std::cout << static_cast<const float *>(in_data)[j] << " ";
1296 } else if (tensor_data_type == TypeId::kNumberTypeInt8) {
1297 std::cout << static_cast<const int8_t *>(in_data)[j] << " ";
1298 } else if (tensor_data_type == TypeId::kNumberTypeUInt8) {
1299 std::cout << static_cast<const uint8_t *>(in_data)[j] << " ";
1300 } else if (tensor_data_type == TypeId::kNumberTypeInt32) {
1301 std::cout << static_cast<const int32_t *>(in_data)[j] << " ";
1302 } else if (tensor_data_type == TypeId::kNumberTypeInt64) {
1303 std::cout << static_cast<const int64_t *>(in_data)[j] << " ";
1304 } else if (tensor_data_type == TypeId::kNumberTypeBool) {
1305 std::cout << static_cast<const bool *>(in_data)[j] << " ";
1306 } else {
1307 MS_LOG(ERROR) << "Datatype: " << tensor_data_type << " is not supported.";
1308 return RET_ERROR;
1309 }
1310 }
1311 std::cout << std::endl;
1312 }
1313 return RET_OK;
1314 }
1315
1316 int BenchmarkUnifiedApi::CompileGraph(mindspore::ModelType model_type, const std::shared_ptr<Context> &context,
1317 const std::string &model_name) {
1318 Key dec_key;
1319 if (!flags_->decrypt_key_str_.empty()) {
1320 dec_key.len = lite::Hex2ByteArray(flags_->decrypt_key_str_, dec_key.key, kEncMaxLen);
1321 if (dec_key.len == 0) {
1322 MS_LOG(ERROR) << "dec_key.len == 0";
1323 return RET_INPUT_PARAM_INVALID;
1324 }
1325 flags_->decrypt_key_str_.clear();
1326 }
1327 Status ret;
1328 if (flags_->crypto_lib_path_.empty()) {
1329 ret = ms_model_.Build(flags_->model_file_, model_type, context);
1330 } else {
1331 ret =
1332 ms_model_.Build(flags_->model_file_, model_type, context, dec_key, flags_->dec_mode_, flags_->crypto_lib_path_);
1333 }
1334 memset(dec_key.key, 0, kEncMaxLen);
1335 if (ret != kSuccess) {
1336 MS_LOG(ERROR) << "ms_model_.Build failed while running " << model_name.c_str();
1337 std::cout << "ms_model_.Build failed while running " << model_name.c_str();
1338 return RET_ERROR;
1339 }
1340 return RET_OK;
1341 }
1342
1343 std::vector<std::vector<int64_t>> BenchmarkUnifiedApi::ParseGraphInputShapeMap(const std::vector<MSTensor> &inputs) {
1344 std::vector<std::vector<int64_t>> resize_dims;
1345 if (flags_->graph_input_shape_map_.size() != inputs.size()) {
1346 MS_LOG(ERROR) << "The number of inputs in the model does not match the parsed inputShape option. The model has ["
1347 << inputs.size() << "] input(s), while the parsed inputShape has ["
1348 << flags_->graph_input_shape_map_.size() << "] input(s).";
1349 return resize_dims;
1350 }
1351 for (auto &model_input : inputs) {
1352 if (flags_->graph_input_shape_map_.find(model_input.Name()) == flags_->graph_input_shape_map_.end()) {
1353 MS_LOG(ERROR) << "model input [" << model_input.Name()
1354 << "] is not found in inputShape option, please double check";
1355 MS_LOG(ERROR) << "model input names are as follows:";
1356 for (auto &mod_input : inputs) {
1357 MS_LOG(ERROR) << mod_input.Name();
1358 }
1359 MS_LOG(ERROR) << "user input names are as follows:";
1360 for (auto &user_input : flags_->graph_input_shape_map_) {
1361 MS_LOG(ERROR) << user_input.first;
1362 }
1363 return resize_dims;
1364 } else {
1365 auto shapes = flags_->graph_input_shape_map_[model_input.Name()];
1366 resize_dims.push_back(this->ConverterToInt64Vector(shapes));
1367 }
1368 }
1369 return resize_dims;
1370 }
1371
1372 #ifdef PARALLEL_INFERENCE
1373 int BenchmarkUnifiedApi::RunParallelBenchmark(std::shared_ptr<mindspore::Context> context) {
1374 if (flags_->resize_dims_.empty() && flags_->graph_input_shape_map_.empty()) {
1375 MS_LOG(ERROR) << "model input shapes should be provided when using parallel predict, please specify --inputShape";
1376 return RET_ERROR;
1377 }
1378 auto status = ParallelInference(context);
1379 MS_CHECK_FALSE_MSG(status != RET_OK, RET_ERROR, "run model pool failed.");
1380 return RET_OK;
1381 }
1382 #endif
1383
1384 int BenchmarkUnifiedApi::RunBenchmark() {
1385 auto start_prepare_time = GetTimeUs();
1386
1387 if (flags_->enable_gl_texture_) {
1388 if (!gl_runtime_.Init()) {
1389 MS_LOG(ERROR) << "opengl runtime init failed ";
1390 std::cerr << "opengl runtime init failed ";
1391 return RET_ERROR;
1392 }
1393 }
1394
1395 // Load graph
1396 std::string model_name = flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1);
1397 auto iter = ModelTypeMap.find(flags_->model_type_);
1398 if (iter == ModelTypeMap.end()) {
1399 MS_LOG(ERROR) << "model_type " << flags_->model_type_ << " is invalid.";
1400 std::cerr << "model_type " << flags_->model_type_ << " is invalid.";
1401 return RET_ERROR;
1402 }
1403 mindspore::ModelType model_type = iter->second;
1404
1405 MS_LOG(INFO) << "start unified benchmark run";
1406 std::cout << "start unified benchmark run" << std::endl;
1407
1408 auto context = std::make_shared<mindspore::Context>();
1409 if (context == nullptr) {
1410 MS_LOG(ERROR) << "New context failed while running " << model_name.c_str();
1411 std::cerr << "New context failed while running " << model_name.c_str() << std::endl;
1412 return RET_ERROR;
1413 }
1414
1415 auto status = InitMSContext(context);
1416 if (status != RET_OK) {
1417 MS_LOG(ERROR) << "InitMSContext failed while running " << model_name.c_str();
1418 std::cout << "InitMSContext failed while running " << model_name.c_str();
1419 return RET_ERROR;
1420 }
1421
1422 (void)UpdateDistributionName(context, &flags_->model_file_);
1423 (void)UpdateDistributionName(context, &flags_->benchmark_data_file_);
1424 (void)UpdateDistributionName(context, &flags_->config_file_);
1425
1426 if (!flags_->config_file_.empty()) {
1427 auto config_ret = ms_model_.LoadConfig(flags_->config_file_);
1428 if (config_ret != kSuccess) {
1429 MS_LOG(ERROR) << "ms_model_.LoadConfig failed while running " << model_name.c_str();
1430 std::cout << "ms_model_.LoadConfig failed while running " << model_name.c_str();
1431 }
1432 }
1433
1434 UpdateConfigInfo();
1435 #ifdef PARALLEL_INFERENCE
1436 if (flags_->enable_parallel_predict_) {
1437 MS_CHECK_FALSE_MSG(RunParallelBenchmark(context) != RET_OK, RET_ERROR, "run model pool failed.");
1438 return RET_OK;
1439 }
1440 #endif
1441
1442 status = CompileGraph(model_type, context, model_name);
1443 MS_CHECK_FALSE_MSG(status != RET_OK, status, "Compile graph failed.");
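  // Apply input resizing: either per-tensor shapes from the inputShape map or the positional
  // resizeDims list, both converted to int64 dim vectors for Model::Resize().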
1444 if (!flags_->graph_input_shape_map_.empty()) {
1445 std::vector<std::vector<int64_t>> resize_dims = ParseGraphInputShapeMap(ms_model_.GetInputs());
1446 MS_CHECK_FALSE_MSG(resize_dims.empty(), RET_ERROR, "resize_dims is empty");
1447 auto ret = ms_model_.Resize(ms_model_.GetInputs(), resize_dims);
1448 if (ret != kSuccess) {
1449 MS_LOG(ERROR) << "Input tensor resize failed.";
1450 std::cout << "Input tensor resize failed.";
1451 return RET_ERROR;
1452 }
1453 } else if (!flags_->resize_dims_.empty()) {
1454 std::vector<std::vector<int64_t>> resize_dims;
1455 (void)std::transform(flags_->resize_dims_.begin(), flags_->resize_dims_.end(), std::back_inserter(resize_dims),
1456 [&](auto &shapes) { return this->ConverterToInt64Vector<int>(shapes); });
1457
1458 auto ret = ms_model_.Resize(ms_model_.GetInputs(), resize_dims);
1459 if (ret != kSuccess) {
1460 MS_LOG(ERROR) << "Input tensor resize failed.";
1461 std::cout << "Input tensor resize failed.";
1462 return RET_ERROR;
1463 }
1464 }
1465
1466 ms_inputs_for_api_ = ms_model_.GetInputs();
1467 ms_outputs_for_api_ = ms_model_.GetOutputs();
1468 auto end_prepare_time = GetTimeUs();
1469 MS_LOG(INFO) << "PrepareTime = " << ((end_prepare_time - start_prepare_time) / kFloatMSEC) << " ms";
1470 std::cout << "PrepareTime = " << ((end_prepare_time - start_prepare_time) / kFloatMSEC) << " ms" << std::endl;
1471
1472 // Load input
1473 MS_LOG(INFO) << "start generate input data";
1474 status = LoadInput();
1475 if (status != RET_OK) {
1476 MS_LOG(ERROR) << "Generate input data error";
1477 return status;
1478 }
1479 return GetBenchmarkResult();
1480 }
1481
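// Runs accuracy comparison when a benchmark data file is given, otherwise performance timing;
// then reports the dump directory (if tensor dumping is enabled) and finalizes the model.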
1482 int BenchmarkUnifiedApi::GetBenchmarkResult() {
1483 if (!flags_->benchmark_data_file_.empty()) {
1484 auto status = MarkAccuracy();
1485 if (status != RET_OK) {
1486 MS_LOG(ERROR) << "Run MarkAccuracy error: " << status;
1487 std::cout << "Run MarkAccuracy error: " << status << std::endl;
1488 return status;
1489 }
1490 } else {
1491 auto status = MarkPerformance();
1492 if (status != RET_OK) {
1493 MS_LOG(ERROR) << "Run MarkPerformance error: " << status;
1494 std::cout << "Run MarkPerformance error: " << status << std::endl;
1495 return status;
1496 }
1497 }
1498 if (flags_->dump_tensor_data_) {
1499     std::cout << "Dumped data is saved to: " + dump_file_output_dir_ << std::endl;
1500 }
1501 Status finalize_ret = ms_model_.Finalize();
1502 if (finalize_ret == kSuccess) {
1503 MS_LOG(INFO) << "Benchmark finalize executed success.";
1504 }
1505 return RET_OK;
1506 }
1507
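// Registers before/after node callbacks that accumulate per-op call counts and latencies, keyed
// by node type and node name. With inter-op parallelism the bookkeeping is guarded by
// op_times_mutex_ and start times are tracked per node name; otherwise a single op_begin_
// timestamp is sufficient.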
1508 int BenchmarkUnifiedApi::InitTimeProfilingCallbackParameter() {
1509 if (flags_->inter_op_parallel_num_ > 1) {
1510 // before callback
1511 ms_before_call_back_ = [&, this](const std::vector<mindspore::MSTensor> &before_inputs,
1512 const std::vector<mindspore::MSTensor> &before_outputs,
1513 const MSCallBackParam &call_param) {
1514       if (before_inputs.empty()) {
1515         MS_LOG(INFO) << "before_inputs is empty";
1516       }
1517       if (before_outputs.empty()) {
1518         MS_LOG(INFO) << "before_outputs is empty";
1519       }
1520 {
1521 std::lock_guard<std::mutex> _l(op_times_mutex_);
1522 if (op_times_by_type_.find(call_param.node_type) == op_times_by_type_.end()) {
1523 op_times_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, 0.0f)));
1524 }
1525 if (op_times_by_name_.find(call_param.node_name) == op_times_by_name_.end()) {
1526 op_times_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, 0.0f)));
1527 }
1528 op_start_times_by_name_[call_param.node_name] = GetTimeUs();
1529 op_call_times_total_++;
1530 }
1531 return true;
1532 };
1533
1534 // after callback
1535 ms_after_call_back_ = [&, this](const std::vector<mindspore::MSTensor> &after_inputs,
1536 const std::vector<mindspore::MSTensor> &after_outputs,
1537 const MSCallBackParam &call_param) {
1538 uint64_t opEnd = GetTimeUs();
1539
1540       if (after_inputs.empty()) {
1541         MS_LOG(INFO) << "after_inputs is empty";
1542       }
1543       if (after_outputs.empty()) {
1544         MS_LOG(INFO) << "after_outputs is empty";
1545       }
1546 {
1547 std::lock_guard<std::mutex> _l(op_times_mutex_);
1548 float cost = static_cast<float>(opEnd - op_start_times_by_name_[call_param.node_name]) / kFloatMSEC;
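        // On GPU, prefer the kernel execution time reported via call_param over host wall-clock time.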
1549 if (flags_->device_ == "GPU") {
1550 cost = static_cast<float>(call_param.execute_time);
1551 }
1552 op_cost_total_ += cost;
1553 op_times_by_type_[call_param.node_type].first++;
1554 op_times_by_type_[call_param.node_type].second += cost;
1555 op_times_by_name_[call_param.node_name].first++;
1556 op_times_by_name_[call_param.node_name].second += cost;
1557 }
1558 return true;
1559 };
1560 } else {
1561 // before callback
1562 ms_before_call_back_ = [&, this](const std::vector<mindspore::MSTensor> &before_inputs,
1563 const std::vector<mindspore::MSTensor> &before_outputs,
1564 const MSCallBackParam &call_param) {
1565       if (before_inputs.empty()) {
1566         MS_LOG(INFO) << "before_inputs is empty";
1567       }
1568       if (before_outputs.empty()) {
1569         MS_LOG(INFO) << "before_outputs is empty";
1570       }
1571 if (op_times_by_type_.find(call_param.node_type) == op_times_by_type_.end()) {
1572 op_times_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, 0.0f)));
1573 }
1574 if (op_times_by_name_.find(call_param.node_name) == op_times_by_name_.end()) {
1575 op_times_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, 0.0f)));
1576 }
1577
1578 op_call_times_total_++;
1579 op_begin_ = GetTimeUs();
1580 return true;
1581 };
1582
1583 // after callback
1584 ms_after_call_back_ = [&, this](const std::vector<mindspore::MSTensor> &after_inputs,
1585 const std::vector<mindspore::MSTensor> &after_outputs,
1586 const MSCallBackParam &call_param) {
1587 uint64_t opEnd = GetTimeUs();
1588
1589       if (after_inputs.empty()) {
1590         MS_LOG(INFO) << "after_inputs is empty";
1591       }
1592       if (after_outputs.empty()) {
1593         MS_LOG(INFO) << "after_outputs is empty";
1594       }
1595
1596 float cost = static_cast<float>(opEnd - op_begin_) / kFloatMSEC;
1597 if (flags_->device_ == "GPU") {
1598 cost = static_cast<float>(call_param.execute_time);
1599 }
1600 op_cost_total_ += cost;
1601 op_times_by_type_[call_param.node_type].first++;
1602 op_times_by_type_[call_param.node_type].second += cost;
1603 op_times_by_name_[call_param.node_name].first++;
1604 op_times_by_name_[call_param.node_name].second += cost;
1605 return true;
1606 };
1607 }
1608 return RET_OK;
1609 }
1610
1611 int BenchmarkUnifiedApi::InitPerfProfilingCallbackParameter() {
1612 #ifndef ENABLE_ARM64
1613 MS_LOG(ERROR) << "Only support perf_profiling on arm64.";
1614 return RET_ERROR;
1615 #else
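  // Configure two hardware counters as one perf event group (perf_fd leads, perf_fd2 joins it):
  // CACHE counts cache references/misses, STALL counts front-end/back-end stalled cycles, and the
  // default counts CPU cycles/instructions. Kernel and hypervisor time are excluded.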
1616 struct perf_event_attr pe, pe2;
1617 memset(&pe, 0, sizeof(struct perf_event_attr));
1618 memset(&pe2, 0, sizeof(struct perf_event_attr));
1619 pe.type = PERF_TYPE_HARDWARE;
1620 pe2.type = PERF_TYPE_HARDWARE;
1621 pe.size = sizeof(struct perf_event_attr);
1622 pe2.size = sizeof(struct perf_event_attr);
1623 pe.disabled = 1;
1624 pe2.disabled = 1;
1625 pe.exclude_kernel = 1; // don't count kernel
1626 pe2.exclude_kernel = 1; // don't count kernel
1627 pe.exclude_hv = 1; // don't count hypervisor
1628 pe2.exclude_hv = 1; // don't count hypervisor
1629 pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
1630 pe2.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
1631 if (flags_->perf_event_ == "CACHE") {
1632 pe.config = PERF_COUNT_HW_CACHE_REFERENCES;
1633 pe2.config = PERF_COUNT_HW_CACHE_MISSES;
1634 } else if (flags_->perf_event_ == "STALL") {
1635 pe.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;
1636 pe2.config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND;
1637 } else {
1638 pe.config = PERF_COUNT_HW_CPU_CYCLES;
1639 pe2.config = PERF_COUNT_HW_INSTRUCTIONS;
1640 }
1641   perf_fd = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
1642 if (perf_fd == -1) {
1643 MS_LOG(ERROR) << "Failed to open perf event " << pe.config;
1644 return RET_ERROR;
1645 }
1646   perf_fd2 = syscall(__NR_perf_event_open, &pe2, 0, -1, perf_fd, 0);
1647 if (perf_fd2 == -1) {
1648 MS_LOG(ERROR) << "Failed to open perf event " << pe2.config;
1649 return RET_ERROR;
1650 }
1651 struct PerfCount zero;
1652 zero.value[0] = 0;
1653 zero.value[1] = 0;
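  // The before callback resets and enables the counter group; the after callback disables it,
  // reads both counters with a single group read, and charges them to the current node.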
1654 // before callback
1655 ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
1656 const std::vector<mindspore::MSTensor> &before_outputs,
1657 const MSCallBackParam &call_param) {
1658     if (before_inputs.empty()) {
1659       MS_LOG(INFO) << "before_inputs is empty";
1660     }
1661     if (before_outputs.empty()) {
1662       MS_LOG(INFO) << "before_outputs is empty";
1663     }
1664 if (op_perf_by_type_.find(call_param.node_type) == op_perf_by_type_.end()) {
1665 op_perf_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, zero)));
1666 }
1667 if (op_perf_by_name_.find(call_param.node_name) == op_perf_by_name_.end()) {
1668 op_perf_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, zero)));
1669 }
1670
1671 op_call_times_total_++;
1672 ioctl(perf_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
1673 ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
1674 return true;
1675 };
1676
1677 // after callback
1678 ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
1679 const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
1680 struct PerfResult res;
1681 ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
1682 if (read(perf_fd, &res, sizeof(struct PerfResult)) == -1) {
1683 MS_LOG(ERROR) << "Failed to read perf_fd";
1684 return false;
1685 }
1686
1687     if (after_inputs.empty()) {
1688       MS_LOG(INFO) << "after_inputs is empty";
1689     }
1690     if (after_outputs.empty()) {
1691       MS_LOG(INFO) << "after_outputs is empty";
1692     }
1693 float cost1 = static_cast<float>(res.values[0].value);
1694 float cost2 = static_cast<float>(res.values[1].value);
1695 op_cost_total_ += cost1;
1696 op_cost2_total_ += cost2;
1697 op_perf_by_type_[call_param.node_type].first++;
1698 op_perf_by_type_[call_param.node_type].second.value[0] += cost1;
1699 op_perf_by_type_[call_param.node_type].second.value[1] += cost2;
1700 op_perf_by_name_[call_param.node_name].first++;
1701 op_perf_by_name_[call_param.node_name].second.value[0] += cost1;
1702 op_perf_by_name_[call_param.node_name].second.value[1] += cost2;
1703 return true;
1704 };
1705 #endif
1706 return RET_OK;
1707 }
1708
1709 namespace {
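// Formats up to kDataToStringMaxNum elements of a raw buffer, interpreted as T, as a
// space-separated string.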
1710 template <typename T>
1711 std::string DataToString(void *data, size_t data_number) {
1712 if (data == nullptr) {
1713 return "Data of tensor is nullptr";
1714 }
1715 std::ostringstream oss;
1716 auto casted_data = static_cast<T *>(data);
1717 for (size_t i = 0; i < kDataToStringMaxNum && i < data_number; i++) {
1718 oss << " " << casted_data[i];
1719 }
1720 return oss.str();
1721 }
1722
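// Renders a tensor's data type, shape and a leading slice of its data for console printing.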
1723 std::string DumpMSTensor(mindspore::MSTensor *tensor) {
1724 if (tensor == nullptr) {
1725 return "Tensor is nullptr";
1726 }
1727 std::ostringstream oss;
1728 oss << " DataType: " << static_cast<int>(tensor->DataType());
1729 oss << " Shape:";
1730 for (auto &dim : tensor->Shape()) {
1731 oss << " " << dim;
1732 }
1733 oss << std::endl << " Data:";
1734 switch (static_cast<int>(tensor->DataType())) {
1735 case kNumberTypeFloat32: {
1736 oss << DataToString<float>(tensor->MutableData(), tensor->ElementNum());
1737 } break;
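    // Float16 has no native host type here, so the raw 16-bit patterns are printed.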
1738 case kNumberTypeFloat16: {
1739 oss << DataToString<int16_t>(tensor->MutableData(), tensor->ElementNum());
1740 } break;
1741 case kNumberTypeInt32: {
1742 oss << DataToString<int32_t>(tensor->MutableData(), tensor->ElementNum());
1743 } break;
1744 case kNumberTypeInt16: {
1745 oss << DataToString<int16_t>(tensor->MutableData(), tensor->ElementNum());
1746 } break;
1747 case kNumberTypeInt8: {
1748 oss << DataToString<int8_t>(tensor->MutableData(), tensor->ElementNum());
1749 } break;
1750 default:
1751 oss << "Unsupported data type to print";
1752 break;
1753 }
1754 return oss.str();
1755 }
1756 #ifndef BENCHMARK_CLIP_JSON
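// Builds a dump file name of the form
// <op_name with '/' replaced by '.'>_<file_type>_<idx>_shape_<dims>_<dtype>[_<format>].bin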
1757 std::string GenerateOutputFileName(mindspore::MSTensor *tensor, const std::string &op_name,
1758 const std::string &file_type, const size_t &idx) {
1759 std::string file_name = op_name;
1760 auto pos = file_name.find_first_of('/');
1761 while (pos != std::string::npos) {
1762 file_name.replace(pos, 1, ".");
1763 pos = file_name.find_first_of('/');
1764 }
1765 file_name += "_" + file_type + "_" + std::to_string(idx) + "_shape_";
1766 for (const auto &dim : tensor->Shape()) {
1767 file_name += std::to_string(dim) + "_";
1768 }
1769 if (kTypeIdMap.find(static_cast<int>(tensor->DataType())) != kTypeIdMap.end()) {
1770 file_name += kTypeIdMap.at(static_cast<int>(tensor->DataType()));
1771 }
1772 auto tensor_format = tensor->format();
1773 if (kTensorFormatMap.find(tensor_format) != kTensorFormatMap.end()) {
1774 file_name += "_" + kTensorFormatMap.at(tensor_format) + ".bin";
1775 } else {
1776     file_name += ".bin";
1777 }
1778
1779 return file_name;
1780 }
1781 #endif
1782 } // namespace
1783
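// Registers callbacks that print every node's input and output tensors to stdout once the node
// has executed.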
1784 int BenchmarkUnifiedApi::InitPrintTensorDataCallbackParameter() {
1785 // before callback
1786 ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
1787 const std::vector<mindspore::MSTensor> &before_outputs,
1788 const MSCallBackParam &call_param) { return true; };
1789
1790 // after callback
1791 ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
1792 const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
1793 std::cout << "================================================================" << std::endl;
1794 std::cout << call_param.node_name << " inputs : " << std::endl;
1795 for (auto ms_tensor : after_inputs) {
1796 std::cout << DumpMSTensor(&ms_tensor) << std::endl;
1797 }
1798 std::cout << "----------------------------------------------------------------" << std::endl;
1799 std::cout << call_param.node_name << " outputs : " << std::endl;
1800 for (auto ms_tensor : after_outputs) {
1801 std::cout << DumpMSTensor(&ms_tensor) << std::endl;
1802 }
1803 std::cout << "================================================================" << std::endl;
1804 return true;
1805 };
1806 return RET_OK;
1807 }
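// Registers callbacks that dump tensors to .bin files according to the dump config: the before
// callback writes node inputs, the after callback writes node outputs, filtered by dump mode,
// input/output mode and the configured kernel list.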
1808 int BenchmarkUnifiedApi::InitDumpTensorDataCallbackParameter() {
1809 #ifndef BENCHMARK_CLIP_JSON
1810 // before callback
1811 ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
1812 const std::vector<mindspore::MSTensor> &before_outputs,
1813 const MSCallBackParam &call_param) {
1814 auto dump_mode = dump_cfg_json_[dump::kSettings][dump::kMode].get<int>();
1815 auto input_output_mode = dump_cfg_json_[dump::kSettings][dump::kInputOutput].get<int>();
1816 auto kernels = dump_cfg_json_[dump::kSettings][dump::kKernels].get<std::vector<std::string>>();
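    // dump_mode 0 dumps every kernel; otherwise only kernels listed in the config are dumped.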
1817 if (dump_mode == 0 || std::find(kernels.begin(), kernels.end(), call_param.node_name) != kernels.end()) {
1818 if (input_output_mode == 0 || input_output_mode == 1) {
1819 for (size_t i = 0; i < before_inputs.size(); i++) {
1820 auto ms_tensor = before_inputs.at(i);
1821 auto file_name = GenerateOutputFileName(&ms_tensor, call_param.node_name, "input", i);
1822 auto abs_file_path = dump_file_output_dir_ + "/" + file_name;
1823 if (WriteToBin(abs_file_path, ms_tensor.MutableData(), ms_tensor.DataSize()) != RET_OK) { // save to file
1824 MS_LOG(ERROR) << "write tensor data to file failed.";
1825 return false;
1826 }
1827 }
1828 }
1829 }
1830 return true;
1831 };
1832
1833 // after callback
1834 ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
1835 const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
1836 auto dump_mode = dump_cfg_json_[dump::kSettings][dump::kMode].get<int>();
1837 auto input_output_mode = dump_cfg_json_[dump::kSettings][dump::kInputOutput].get<int>();
1838 auto kernels = dump_cfg_json_[dump::kSettings][dump::kKernels].get<std::vector<std::string>>();
1839 if (dump_mode == kDumpInputsAndOutputs ||
1840 std::find(kernels.begin(), kernels.end(), call_param.node_name) != kernels.end()) {
1841 if (input_output_mode == kDumpInputsAndOutputs || input_output_mode == kDumpOutputs) {
1842 for (size_t i = 0; i < after_outputs.size(); i++) {
1843 auto ms_tensor = after_outputs.at(i);
1844 auto file_name = GenerateOutputFileName(&ms_tensor, call_param.node_name, "output", i);
1845 auto abs_file_path = dump_file_output_dir_ + "/" + file_name;
1846 if (WriteToBin(abs_file_path, ms_tensor.MutableData(), ms_tensor.DataSize()) != RET_OK) { // save to file
1847 MS_LOG(ERROR) << "write tensor data to file failed.";
1848 return false;
1849 }
1850 }
1851 }
1852 }
1853 return true;
1854 };
1855 #endif
1856 return RET_OK;
1857 }
1858
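// In parallel-predict mode the input buffers are owned by the benchmark: detach them from the
// input MSTensors and free the copies tracked in all_inputs_data_.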
1859 BenchmarkUnifiedApi::~BenchmarkUnifiedApi() {
1860 #ifdef PARALLEL_INFERENCE
1861 if (!flags_->enable_parallel_predict_) {
1862 return;
1863 }
1864 for (auto tensor : ms_inputs_for_api_) {
1865 auto data = tensor.MutableData();
1866 if (data != nullptr) {
1867 tensor.SetData(nullptr);
1868 }
1869 }
1870 for (auto &input : all_inputs_data_) {
1871 for (auto &data : input) {
1872 if (data != nullptr) {
1873 auto buf = static_cast<char *>(data);
1874 delete[] buf;
1875 data = nullptr;
1876 }
1877 }
1878 }
1879 #endif
1880 }
1881 } // namespace lite
1882 } // namespace mindspore
1883