1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "tools/benchmark/benchmark_unified_api.h"
18 #include <cinttypes>
19 #include <algorithm>
20 #include <utility>
21 #include <functional>
22 #include <iomanip>
23 #include <limits>
24 #include "src/common/common.h"
25 #include "src/tensor.h"
26 #include "tools/common/string_util.h"
27 #include "nnacl/nnacl_common.h"
28 #ifdef ENABLE_ARM64
29 #include <linux/perf_event.h>
30 #include <sys/ioctl.h>
31 #include <asm/unistd.h>
32 #include <unistd.h>
33 #endif
34 #ifdef SUPPORT_NNIE
35 #include "include/hi_common.h"
36 #include "include/hi_comm_vb.h"
37 #include "include/mpi_sys.h"
38 #include "include/mpi_vb.h"
39 #endif
40 #ifdef PARALLEL_INFERENCE
41 #include <thread>
42 #include "src/common/config_file.h"
43 #endif
44 #include "include/c_api/model_c.h"
45 #include "include/c_api/context_c.h"
46
47 namespace mindspore {
48 constexpr size_t kDataToStringMaxNum = 40;
49 constexpr int kPrintDataNum = 20;
50 constexpr int kFrequencyDefault = 3;
51 constexpr int kPercentageDivisor = 100;
52 constexpr int kDumpInputsAndOutputs = 0;
53 constexpr int kDumpOutputs = 2;
54 #ifdef PARALLEL_INFERENCE
55 constexpr int kMaxRequestNum = 200;
56 #endif
57 namespace lite {
58 int BenchmarkUnifiedApi::GenerateGLTexture(std::map<std::string, GLuint> *input_gl_texture) {
59 for (auto tensor : ms_inputs_for_api_) {
60 float *input_data = reinterpret_cast<float *>(malloc(tensor.DataSize()));
61 if (input_data == nullptr) {
62 MS_LOG(ERROR) << "malloc input_data failed";
63 return RET_ERROR;
64 }
65 int status = GenerateRandomData(tensor.DataSize(), input_data, static_cast<int>(tensor.DataType()));
66 if (status != RET_OK) {
67 free(input_data);
68 std::cerr << "GenerateRandomData for inTensor failed: " << status << std::endl;
69 MS_LOG(ERROR) << "GenerateRandomData for inTensor failed:" << status;
70 return status;
71 }
72 status = FillGLTextureToTensor(input_gl_texture, &tensor, tensor.Name(), input_data);
73 free(input_data);
74 if (status != RET_OK) {
75 MS_LOG(ERROR) << "Fill GLTexture to input tensor failed: " << status;
76 return status;
77 }
78 }
79
80 return RET_OK;
81 }
82
83 int BenchmarkUnifiedApi::FillGLTextureToTensor(std::map<std::string, GLuint> *gl_texture, mindspore::MSTensor *tensor,
84 std::string name, void *data) {
85 MS_CHECK_TRUE_MSG(gl_texture != nullptr, RET_ERROR, "gl_texture is nullptr");
86 MS_CHECK_TRUE_MSG(tensor != nullptr, RET_ERROR, "tensor is nullptr");
87
88 auto image_id = 0;
89
90 int width = 1, height = 1, channel = 1;
91 if (tensor->Shape().size() == DIMENSION_2D) {
92 height = tensor->Shape()[kNHWC_N];
93 channel = tensor->Shape()[kNHWC_H];
94 } else if (tensor->Shape().size() == DIMENSION_3D) {
95 width = tensor->Shape()[kNHWC_H];
96 height = tensor->Shape()[kNHWC_N];
97 channel = tensor->Shape()[kNHWC_C];
98 } else if (tensor->Shape().size() == DIMENSION_4D) {
99 width = tensor->Shape()[kNHWC_W];
100 height = tensor->Shape()[kNHWC_H];
101 channel = tensor->Shape()[kNHWC_C];
102 } else {
103 MS_LOG(ERROR) << "the tensor shape is not supported";
104 return RET_ERROR;
105 }
106
107 if (data == nullptr) {
108 image_id = gl_runtime_.GLCreateTexture(width, height, channel);
109 } else {
110 image_id = gl_runtime_.CopyHostToDeviceTexture(data, width, height, channel);
111 }
112
113 if (image_id != GL_NONE) {
114 gl_texture->insert(std::pair<std::string, GLuint>(name, image_id));
115 } else {
116 MS_LOG(ERROR) << "glMemPool CopyHostToDeviceTexture failed";
117 }
118 return RET_OK;
119 }
120
121 int BenchmarkUnifiedApi::LoadAndBindGLTexture() {
122 std::map<std::string, GLuint> input_gl_texture;
123 std::map<std::string, GLuint> output_gl_texture;
124
125 if (flags_->in_data_file_.empty()) {
126 auto status = GenerateGLTexture(&input_gl_texture);
127 if (status != RET_OK) {
128 std::cerr << "Generate input GLTexture error " << status << std::endl;
129 MS_LOG(ERROR) << "Generate input GLTexture error " << status;
130 return status;
131 }
132 } else {
133 auto status = ReadGLTextureFile(&input_gl_texture);
134 if (status != RET_OK) {
135 std::cerr << "ReadGLTextureFile error, " << status << std::endl;
136 MS_LOG(ERROR) << "ReadGLTextureFile error, " << status;
137 return status;
138 }
139 }
140
141 for (auto &tensor : ms_outputs_for_api_) {
142 auto status = FillGLTextureToTensor(&output_gl_texture, &tensor, tensor.Name());
143 if (status != RET_OK) {
144 MS_LOG(ERROR) << "Fill GLTexture to output tensor failed: " << status;
145 return status;
146 }
147 }
148
149 auto status = ms_model_.BindGLTexture2DMemory(input_gl_texture, &output_gl_texture);
150 if (status != kSuccess) {
151 MS_LOG(ERROR) << "BindGLTexture2DMemory failed";
152 return RET_ERROR;
153 }
154 return RET_OK;
155 }
156
157 int BenchmarkUnifiedApi::ReadGLTextureFile(std::map<std::string, GLuint> *input_gl_texture) {
158 if (ms_inputs_for_api_.empty()) {
159 return RET_OK;
160 }
161 if (this->flags_->in_data_type_ == kImage) {
162 MS_LOG(ERROR) << "Not supported image input";
163 return RET_ERROR;
164 } else {
165 for (size_t i = 0; i < flags_->input_data_list_.size(); i++) {
166 auto tensor = ms_inputs_for_api_.at(i);
167 size_t size;
168 char *bin_buf = ReadFile(flags_->input_data_list_[i].c_str(), &size);
169 if (bin_buf == nullptr) {
170 MS_LOG(ERROR) << "ReadFile return nullptr";
171 return RET_ERROR;
172 }
173 auto tensor_data_size = tensor.DataSize();
174 if (size != tensor_data_size) {
175 std::cerr << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size
176 << std::endl;
177 MS_LOG(ERROR) << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size;
178 delete[] bin_buf;
179 return RET_ERROR;
180 }
181
182 auto status = FillGLTextureToTensor(input_gl_texture, &tensor, tensor.Name(), bin_buf);
183 delete[] bin_buf;
184 if (status != RET_OK) {
185 MS_LOG(ERROR) << "Fill GLTexture to input tensor failed: " << status;
186 return status;
187 }
188 }
189 }
190
191 return RET_OK;
192 }
193
194 int BenchmarkUnifiedApi::LoadInput() {
195 if (flags_->enable_gl_texture_ == true) {
196 if (lite::BenchmarkUnifiedApi::LoadAndBindGLTexture() != RET_OK) {
197 MS_LOG(ERROR) << "Generate input GLTexture error";
198 return RET_ERROR;
199 }
200 return RET_OK;
201 }
202
203 if (flags_->in_data_file_.empty()) {
204 auto status = GenerateInputData();
205 if (status != RET_OK) {
206 std::cerr << "Generate input data error " << status << std::endl;
207 MS_LOG(ERROR) << "Generate input data error " << status;
208 return status;
209 }
210 } else {
211 auto status = ReadInputFile();
212 if (status != RET_OK) {
213 std::cerr << "ReadInputFile error, " << status << std::endl;
214 MS_LOG(ERROR) << "ReadInputFile error, " << status;
215 return status;
216 }
217 }
218 return RET_OK;
219 }
220
221 int BenchmarkUnifiedApi::GenerateInputData() {
222 #ifdef PARALLEL_INFERENCE
223 if (flags_->enable_parallel_predict_) {
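// In parallel-predict mode the inputs are prepared as raw host buffers (one set per request); each buffer size is derived from the resize dims and the element type below.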
224 std::vector<void *> inputs;
225 for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
226 auto tensor_name = ms_inputs_for_api_[i].Name();
227 size_t size;
228 if (ms_inputs_for_api_[i].DataType() == static_cast<enum DataType>(kNumberTypeFloat32)) {
229 size = sizeof(float);
230 } else if (ms_inputs_for_api_[i].DataType() == static_cast<enum DataType>(kNumberTypeInt32)) {
231 size = sizeof(int32_t);
232 } else {
233 MS_LOG(ERROR) << "data type is not supported in model pool.";
234 return RET_ERROR;
235 }
236 for (size_t j = 0; j < flags_->resize_dims_[i].size(); j++) {
237 size *= flags_->resize_dims_[i][j];
238 }
239 void *input_data = new (std::nothrow) char[size];
240 if (input_data == nullptr) {
241 MS_LOG(ERROR) << "new input_data failed";
242 for (auto &data : inputs) {
243 auto buf = static_cast<char *>(data);
244 delete[] buf;
245 data = nullptr;
246 }
247 return RET_ERROR;
248 }
249 inputs.push_back(input_data);
250 int status = GenerateRandomData(size, input_data, static_cast<int>(ms_inputs_for_api_[i].DataType()));
251 if (status != RET_OK) {
252 MS_LOG(ERROR) << "GenerateRandomData for inTensor failed:" << status;
253 for (auto &data : inputs) {
254 auto buf = static_cast<char *>(data);
255 delete[] buf;
256 data = nullptr;
257 }
258 return status;
259 }
260 }
261 all_inputs_data_.push_back(inputs);
262 return RET_OK;
263 }
264 #endif
265 for (auto &tensor : ms_inputs_for_api_) {
266 if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
267 MSTensor *input = MSTensor::StringsToTensor(tensor.Name(), {"you're the best."});
268 if (input == nullptr) {
269 std::cerr << "StringsToTensor failed" << std::endl;
270 MS_LOG(ERROR) << "StringsToTensor failed";
271 return RET_ERROR;
272 }
273 tensor = *input;
274 delete input;
275 } else {
276 auto input_data = tensor.MutableData();
277 if (input_data == nullptr) {
278 MS_LOG(ERROR) << "MallocData for inTensor failed";
279 return RET_ERROR;
280 }
281 int status = GenerateRandomData(tensor.DataSize(), input_data, static_cast<int>(tensor.DataType()));
282 if (status != RET_OK) {
283 std::cerr << "GenerateRandomData for inTensor failed: " << status << std::endl;
284 MS_LOG(ERROR) << "GenerateRandomData for inTensor failed:" << status;
285 return status;
286 }
287 }
288 }
289 return RET_OK;
290 }
291
292 void BenchmarkUnifiedApi::UpdateConfigInfo() {
293 #define WIPE_DEEP_CONFIG_ENV '0'
294 #define WIPE_DEEP_CONFIG_VOCAB_SIZE "100"
295 #define WIPE_DEEP_CONFIG_DEVICE_CACHE_SIZE "40"
296
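// When BENCHMARK_UPDATE_CONFIG_ENV is set to '0', shrink the model cache config (vocab size 100, device cache size 40) before running the benchmark.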
297 auto env = std::getenv("BENCHMARK_UPDATE_CONFIG_ENV");
298 if (env == nullptr) {
299 return;
300 }
301 if (env[0] == WIPE_DEEP_CONFIG_ENV) {
302 ms_model_.UpdateConfig(kMSCacheSection, std::make_pair(kMSCacheVocabSizeKey, WIPE_DEEP_CONFIG_VOCAB_SIZE));
303 ms_model_.UpdateConfig(kMSCacheSection, std::make_pair(kMSCacheDeviceSizeKey, WIPE_DEEP_CONFIG_DEVICE_CACHE_SIZE));
304 }
305 return;
306 }
307
308 int BenchmarkUnifiedApi::ReadInputFile() {
309 #ifdef PARALLEL_INFERENCE
310 if (flags_->enable_parallel_predict_) {
311 std::vector<void *> inputs;
312 for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
313 size_t size;
314 char *bin_buf = ReadFile(flags_->input_data_list_[i].c_str(), &size);
315 if (bin_buf == nullptr) {
316 MS_LOG(ERROR) << "ReadFile return nullptr";
317 for (auto &data : inputs) {
318 auto buf = static_cast<char *>(data);
319 delete[] buf;
320 data = nullptr;
321 }
322 return RET_ERROR;
323 }
324 inputs.push_back(bin_buf);
325 }
326 all_inputs_data_.push_back(inputs);
327 return RET_OK;
328 }
329 #endif
330 if (ms_inputs_for_api_.empty()) {
331 return RET_OK;
332 }
333
334 if (this->flags_->in_data_type_ == kImage) {
335 MS_LOG(ERROR) << "Not supported image input";
336 return RET_ERROR;
337 } else {
338 for (size_t i = 0; i < flags_->input_data_list_.size(); i++) {
339 auto &cur_tensor = ms_inputs_for_api_.at(i);
340 size_t size;
341 char *bin_buf = ReadFile(flags_->input_data_list_[i].c_str(), &size);
342 if (bin_buf == nullptr) {
343 MS_LOG(ERROR) << "ReadFile return nullptr";
344 return RET_ERROR;
345 }
346 if (static_cast<int>(cur_tensor.DataType()) == kObjectTypeString) {
347 std::string str(bin_buf, size);
348 MSTensor *input = MSTensor::StringsToTensor(cur_tensor.Name(), {str});
349 if (input == nullptr) {
350 std::cerr << "StringsToTensor failed" << std::endl;
351 MS_LOG(ERROR) << "StringsToTensor failed";
352 delete[] bin_buf;
353 return RET_ERROR;
354 }
355 cur_tensor = *input;
356 } else {
357 auto tensor_data_size = cur_tensor.DataSize();
358 if (size != tensor_data_size) {
359 std::cerr << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size
360 << std::endl;
361 MS_LOG(ERROR) << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size;
362 delete[] bin_buf;
363 return RET_ERROR;
364 }
365 auto input_data = cur_tensor.MutableData();
366 if (input_data == nullptr) {
367 MS_LOG(ERROR) << "input_data is nullptr.";
368 delete[] bin_buf;
369 return RET_ERROR;
370 }
371 memcpy(input_data, bin_buf, tensor_data_size);
372 }
373 delete[] bin_buf;
374 }
375 }
376 return RET_OK;
377 }
378
379 int BenchmarkUnifiedApi::GetDataTypeByTensorName(const std::string &tensor_name) {
380 #ifdef PARALLEL_INFERENCE
381 for (auto tensor : ms_outputs_for_api_) {
382 auto name = tensor.Name();
383 if (name == tensor_name) {
384 return static_cast<int>(tensor.DataType());
385 }
386 }
387 MS_LOG(ERROR) << "cannot find tensor name: " << tensor_name << " in model output.";
388 return static_cast<int>(DataType::kTypeUnknown);
389 #endif
390 return static_cast<int>(ms_model_.GetOutputByTensorName(tensor_name).DataType());
391 }
392
393 void BenchmarkUnifiedApi::UpdateDistributionName(const std::shared_ptr<mindspore::Context> &context,
394 std::string *name) {
395 if (flags_->device_ != "GPU") {
396 return;
397 }
398
399 if (name->size() == 0) {
400 return;
401 }
402
403 if (context->MutableDeviceInfo().size() == 0) {
404 return;
405 }
406
407 auto device_info = context->MutableDeviceInfo().front();
408 GPUDeviceInfo *gpu_info = reinterpret_cast<GPUDeviceInfo *>(device_info.get());
409 auto rank_id = gpu_info->GetRankID();
410 if (rank_id == 0) {
411 return;
412 }
413 gpu_info->SetDeviceID(rank_id);
414
415 /* model file & benchmark data file: include .mindir
416 config file : include .config */
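// e.g. for rank 1, "model.mindir" becomes "model1.mindir", so each non-zero rank loads its own file.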
417 auto replace_pos = name->find(".mindir");
418 if (replace_pos == std::string::npos) {
419 replace_pos = name->find(".config");
420 }
421
422 if (replace_pos == std::string::npos) {
423 return;
424 }
425
426 *name = name->replace(replace_pos, sizeof('.'), std::to_string(rank_id) + ".");
427
428 MS_LOG(INFO) << "Update distribution info: " << *name;
429 std::cout << "Update distribution info: " << *name << std::endl;
430 return;
431 }
432
433 void BenchmarkUnifiedApi::InitMSContextForGPU(const std::shared_ptr<mindspore::Context> &context,
434 std::vector<std::shared_ptr<DeviceInfoContext>> *device_list) {
435 std::shared_ptr<GPUDeviceInfo> gpu_device_info = std::make_shared<GPUDeviceInfo>();
436 gpu_device_info->SetEnableFP16(flags_->enable_fp16_);
437 uint32_t device_id = 0;
438 auto device_id_env = std::getenv("GPU_DEVICE_ID");
439 if (device_id_env != nullptr) {
440 // try {
441 device_id = static_cast<uint32_t>(std::stoul(device_id_env));
442 // } catch (std::invalid_argument &e) {
443 // MS_LOG(WARNING) << "Invalid device id env:" << device_id_env << ". Set default device id 0.";
444 // }
445 MS_LOG(INFO) << "GPU device_id = " << device_id;
446 }
447 gpu_device_info->SetDeviceID(device_id);
448 if (flags_->device_id_ >= 0) {
449 gpu_device_info->SetDeviceID(flags_->device_id_);
450 MS_LOG(INFO) << "GPU device_id = " << flags_->device_id_;
451 }
452 if (flags_->enable_gl_texture_) {
453 gpu_device_info->SetEnableGLTexture(flags_->enable_gl_texture_);
454
455 auto gl_context = eglGetCurrentContext();
456 gpu_device_info->SetGLContext(gl_context);
457
458 auto gl_display = eglGetCurrentDisplay();
459 gpu_device_info->SetGLDisplay(gl_display);
460 } else {
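// Without GL texture sharing, GPU inference goes through the TensorRT provider with the default allocator.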
461 gpu_device_info->SetProvider("tensorrt");
462 gpu_device_info->SetAllocator(nullptr);
463 }
464 device_list->push_back(gpu_device_info);
465 }
466
467 void BenchmarkUnifiedApi::InitMSContextForAscend(const std::shared_ptr<mindspore::Context> &context,
468 std::vector<std::shared_ptr<DeviceInfoContext>> *device_list) {
469 uint32_t device_id = 0;
470 auto device_id_env = std::getenv("ASCEND_DEVICE_ID");
471 if (device_id_env != nullptr) {
472 // try {
473 device_id = static_cast<uint32_t>(std::stoul(device_id_env));
474 // } catch (std::invalid_argument &e) {
475 // MS_LOG(WARNING) << "Invalid device id env:" << device_id_env << ". Set default device id 0.";
476 // }
477 MS_LOG(INFO) << "Ascend device_id = " << device_id;
478 }
479 std::shared_ptr<AscendDeviceInfo> ascend_device_info = std::make_shared<AscendDeviceInfo>();
480 ascend_device_info->SetDeviceID(device_id);
481 ascend_device_info->SetProvider(flags_->provider_);
482 auto back_policy_env = std::getenv("ASCEND_BACK_POLICY");
483 if (back_policy_env != nullptr) {
484 ascend_device_info->SetProvider(back_policy_env);
485 }
486 #ifdef ENABLE_CLOUD_FUSION_INFERENCE
487 if (flags_->device_id_ >= 0 && flags_->rank_id_ >= 0) {
488 ascend_device_info->SetDeviceID(flags_->device_id_);
489 ascend_device_info->SetRankID(flags_->rank_id_);
490 ascend_device_info->SetProvider("ge");
491 }
492 #endif
493 device_list->push_back(ascend_device_info);
494 }
495
496 int BenchmarkUnifiedApi::InitMSContext(const std::shared_ptr<mindspore::Context> &context) {
497 context->SetThreadNum(flags_->num_threads_);
498 context->SetGroupInfoFile(flags_->group_info_file_);
499 context->SetThreadAffinity(flags_->cpu_bind_mode_);
500 context->SetInterOpParallelNum(flags_->inter_op_parallel_num_);
501 if (!flags_->core_list_.empty()) {
502 context->SetThreadAffinity(flags_->core_list_);
503 }
504 #ifndef ENABLE_CLOUD_FUSION_INFERENCE
505 if (flags_->delegate_mode_ == "CoreML") {
506 context->SetBuiltInDelegate(kCoreML);
507 } else if (flags_->delegate_mode_ == "NNAPI") {
508 context->SetBuiltInDelegate(kNNAPI);
509 }
510 context->SetEnableParallel(flags_->enable_parallel_);
511 #endif
512
513 auto &device_list = context->MutableDeviceInfo();
514 if (flags_->device_ == "GPU" || flags_->device_ == "Auto") {
515 InitMSContextForGPU(context, &device_list);
516 }
517
518 if (flags_->device_ == "NPU" || flags_->device_ == "Auto") {
519 std::shared_ptr<KirinNPUDeviceInfo> npu_device_info = std::make_shared<KirinNPUDeviceInfo>();
520 npu_device_info->SetEnableFP16(flags_->enable_fp16_);
521 npu_device_info->SetFrequency(kFrequencyDefault);
522 device_list.push_back(npu_device_info);
523 }
524
525 if (flags_->device_ == "Ascend" || flags_->device_ == "Auto") {
526 MS_LOG(ERROR) << "OHOS does not support Ascend devices.";
527 return RET_NOT_SUPPORT;
528 }
529
530 if (flags_->device_ == "NNRT" || flags_->device_ == "Auto") {
531 std::shared_ptr<NNRTDeviceInfo> nnrt_device_info = std::make_shared<NNRTDeviceInfo>();
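// Enumerate the available NNRT device descriptors and pick the first one whose name starts with "NPU_".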
532 size_t num = 0;
533 auto descs = OH_AI_GetAllNNRTDeviceDescs(&num);
534 NNRTDeviceDesc *desc_nnrt = nullptr;
535 for (size_t i = 0; i < num; i++) {
536 auto desc = OH_AI_GetElementOfNNRTDeviceDescs(descs, i);
537 auto name = OH_AI_GetNameFromNNRTDeviceDesc(desc);
538 if (strncmp(name, "NPU_", 4) == 0) { // NPU inference with online compilation
539 desc_nnrt = desc;
540 break;
541 }
542 }
543 if (desc_nnrt == nullptr) {
544 BENCHMARK_LOG_ERROR("get NNRT device desc failed");
545 return RET_ERROR;
546 }
547 auto id = OH_AI_GetDeviceIdFromNNRTDeviceDesc(desc_nnrt);
548 nnrt_device_info->SetDeviceID(id);
549 nnrt_device_info->SetPerformanceMode(flags_->nnrt_performance_mode_);
550 OH_AI_DestroyAllNNRTDeviceDescs(&descs);
551 device_list.push_back(nnrt_device_info);
552 }
553
554 // CPU priority is behind GPU and NPU
555 std::shared_ptr<CPUDeviceInfo> device_info = std::make_shared<CPUDeviceInfo>();
556 device_info->SetEnableFP16(flags_->enable_fp16_);
557 device_info->SetProvider(flags_->provider_);
558 device_list.push_back(device_info);
559
560 return RET_OK;
561 }
562 #ifdef PARALLEL_INFERENCE
563 int BenchmarkUnifiedApi::CompareOutputForModelPool(std::vector<mindspore::MSTensor> *outputs) {
564 if (outputs->empty()) {
565 MS_LOG(ERROR) << "outputs is empty.";
566 return RET_ERROR;
567 }
568 std::cout << "================ Comparing Output data ================" << std::endl;
569 float total_bias = 0;
570 int total_size = 0;
571 // check the output tensor name.
572 for (size_t i = 0; i < outputs->size(); i++) {
573 std::string tensor_name = outputs->at(i).Name();
574 mindspore::MSTensor tensor = outputs->at(i);
575 if (tensor == nullptr) {
576 MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
577 return RET_ERROR;
578 }
579 constexpr float kParallelRelative = 1e-7;
580 constexpr float kParallelAbsolute = 1e-10;
581 int ret = CompareDataGetTotalBiasAndSize(tensor_name, &tensor, &total_bias, &total_size, kParallelRelative,
582 kParallelAbsolute);
583 if (ret != RET_OK) {
584 MS_LOG(ERROR) << "Error in CompareData";
585 std::cerr << "Error in CompareData" << std::endl;
586 std::cout << "=======================================================" << std::endl << std::endl;
587 return ret;
588 }
589 }
590 float mean_bias;
591 if (total_size != 0) {
592 mean_bias = ((total_bias / float_t(total_size)) * kPercentageDivisor);
593 } else {
594 mean_bias = 0;
595 }
596
597 std::cout << "Mean bias of all nodes/tensors: " << mean_bias << "%" << std::endl;
598 std::cout << "=======================================================" << std::endl << std::endl;
599
600 if (mean_bias > this->flags_->accuracy_threshold_) {
601 MS_LOG(ERROR) << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%";
602 std::cerr << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%" << std::endl;
603 return RET_ERROR;
604 }
605 return RET_OK;
606 }
607 #endif
608
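// Expands a raw IEEE 754 half-precision bit pattern to single precision: the mantissa and exponent fields are
// widened, the exponent is re-biased, subnormal inputs are flushed to zero, and the result is stored as a bit pattern.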
609 void Convert2Float32(float *__restrict out, const uint16_t in) {
610 uint32_t t1;
611 uint32_t t2;
612 uint32_t t3;
613
614 t1 = in & 0x7fffu;
615 t2 = in & 0x8000u;
616 t3 = in & 0x7c00u;
617
618 t1 <<= 13u;
619 t2 <<= 16u;
620
621 t1 += 0x38000000;
622
623 t1 = (t3 == 0 ? 0 : t1);
624
625 t1 |= t2;
626
627 *(reinterpret_cast<uint32_t *>(out)) = t1;
628 }
629
630 namespace {
631 template <typename T>
632 bool VectorValueCompare(const std::vector<T> &vec1, const std::vector<T> &vec2) {
633 if (vec1.size() != vec2.size()) {
634 return false;
635 }
636 for (auto &ele : vec1) {
637 if (!IsContain(vec2, ele)) {
638 return false;
639 }
640 }
641 return true;
642 }
643 } // namespace
644
645 int BenchmarkUnifiedApi::CompareOutput() {
646 std::cout << "================ Comparing Output data ================" << std::endl;
647 float total_bias = 0;
648 int total_size = 0;
649 // check the output tensor name.
650 if (!VectorValueCompare(this->benchmark_tensor_names_, ms_model_.GetOutputTensorNames())) {
651 MS_LOG(ERROR) << "The output tensor name is wrong.";
652 return RET_ERROR;
653 }
654 for (const auto &calib_tensor : benchmark_data_) {
655 std::string tensor_name = calib_tensor.first;
656 mindspore::MSTensor tensor = ms_model_.GetOutputByTensorName(tensor_name);
657 if (tensor == nullptr) {
658 MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
659 return RET_ERROR;
660 }
661 int ret;
662 if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
663 std::vector<std::string> output_strings = MSTensor::TensorToStrings(tensor);
664 ret = CompareStringData(tensor_name, calib_tensor.second->strings_data, output_strings);
665 } else {
666 if (flags_->enable_gl_texture_) {
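// GL texture outputs hold a texture id rather than host data: copy the texture back to the host and wrap it in a temporary float32 tensor before comparing.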
667 auto *gltexture_id = reinterpret_cast<GLuint *>(tensor.MutableData());
668 if (gltexture_id == nullptr) {
669 MS_LOG(ERROR) << "get gltexture_id failed";
670 return RET_ERROR;
671 }
672 auto tmp = gl_runtime_.CopyDeviceTextureToHost(*gltexture_id);
673 if (tmp == nullptr) {
674 MS_LOG(ERROR) << "CopyDeviceTextureToHost failed";
675 return RET_ERROR;
676 }
677 float *hostptr = reinterpret_cast<float *>(tmp);
678
679 auto tensor_shape = tensor.Shape();
680 auto data_len =
681 std::accumulate(tensor_shape.begin(), tensor_shape.end(), sizeof(float), std::multiplies<size_t>());
682 auto *new_tensor = new (std::nothrow)
683 MSTensor(tensor_name, mindspore::DataType::kNumberTypeFloat32, tensor_shape, hostptr, data_len);
684 MS_CHECK_TRUE_MSG(new_tensor != nullptr, RET_ERROR, "new tensor failed");
685 if (new_tensor->MutableData() == nullptr) {
686 MS_LOG(ERROR) << "CopyDeviceTextureToHost failed";
687 delete new_tensor;
688 return RET_ERROR;
689 }
690 ret = CompareDataGetTotalBiasAndSize(tensor_name, new_tensor, &total_bias, &total_size);
691 delete new_tensor;
692 } else {
693 ret = CompareDataGetTotalBiasAndSize(tensor_name, &tensor, &total_bias, &total_size);
694 }
695 }
696 if (ret != RET_OK) {
697 MS_LOG(ERROR) << "Error in CompareData";
698 std::cerr << "Error in CompareData" << std::endl;
699 std::cout << "=======================================================" << std::endl << std::endl;
700 return ret;
701 }
702 }
703 float mean_bias;
704 if (total_size != 0) {
705 mean_bias = ((total_bias / float_t(total_size)) * kPercentageDivisor);
706 } else {
707 mean_bias = 0;
708 }
709
710 std::cout << "Mean bias of all nodes/tensors: " << mean_bias << "%" << std::endl;
711 std::cout << "=======================================================" << std::endl << std::endl;
712
713 if (mean_bias > this->flags_->accuracy_threshold_) {
714 MS_LOG(ERROR) << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%";
715 std::cerr << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%" << std::endl;
716 return RET_ERROR;
717 }
718 return RET_OK;
719 }
720
721 int BenchmarkUnifiedApi::CompareOutputByCosineDistance(float cosine_distance_threshold) {
722 std::cout << "================ Comparing Output data ================" << std::endl;
723 float total_cosine_distance = 0;
724 int total_size = 0;
725 // check the output tensor name.
726 if (this->benchmark_tensor_names_ != ms_model_.GetOutputTensorNames()) {
727 MS_LOG(ERROR) << "The output tensor name is wrong.";
728 return RET_ERROR;
729 }
730 for (const auto &calib_tensor : benchmark_data_) {
731 std::string tensor_name = calib_tensor.first;
732 mindspore::MSTensor tensor = ms_model_.GetOutputByTensorName(tensor_name);
733 if (tensor == nullptr) {
734 MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
735 return RET_ERROR;
736 }
737 int ret;
738 if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
739 std::vector<std::string> output_strings = MSTensor::TensorToStrings(tensor);
740 ret = CompareStringData(tensor_name, calib_tensor.second->strings_data, output_strings);
741 } else {
742 ret = CompareDataGetTotalCosineDistanceAndSize(tensor_name, &tensor, &total_cosine_distance, &total_size);
743 }
744 if (ret != RET_OK) {
745 MS_LOG(ERROR) << "Error in CompareData";
746 std::cerr << "Error in CompareData" << std::endl;
747 std::cout << "=======================================================" << std::endl << std::endl;
748 return ret;
749 }
750 }
751 float mean_cosine_distance;
752 if (total_size != 0) {
753 mean_cosine_distance = total_cosine_distance / float_t(total_size);
754 } else {
755 mean_cosine_distance = CosineErrMaxVal;
756 }
757 mean_cosine_distance = 1 - mean_cosine_distance;
758 std::cout << "Cosine distance of all nodes/tensors: " << std::setprecision(std::numeric_limits<double>::digits10)
759 << mean_cosine_distance << std::endl;
760 std::cout << "=======================================================" << std::endl << std::endl;
761
762 if (mean_cosine_distance < cosine_distance_threshold) {
763 MS_LOG(ERROR) << "cosine distance of all nodes/tensors is too small: " << mean_cosine_distance;
764 std::cerr << "Mean cosine distance of all nodes/tensors is too small: " << mean_cosine_distance << std::endl;
765 return RET_ERROR;
766 }
767 return RET_OK;
768 }
769
770 int BenchmarkUnifiedApi::CompareDataGetTotalBiasAndSize(const std::string &name, mindspore::MSTensor *tensor,
771 float *total_bias, int *total_size, float relative_tolerance,
772 float absolute_tolerance) {
773 float bias = 0;
774 auto mutableData = tensor->MutableData();
775 if (mutableData == nullptr) {
776 MS_LOG(ERROR) << "mutableData is nullptr.";
777 return RET_ERROR;
778 }
779 switch (static_cast<int>(tensor->DataType())) {
780 case TypeId::kNumberTypeFloat:
781 case TypeId::kNumberTypeFloat32: {
782 bias = CompareData<float, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
783 break;
784 }
785 case TypeId::kNumberTypeInt8: {
786 bias = CompareData<int8_t, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
787 break;
788 }
789 case TypeId::kNumberTypeUInt8: {
790 bias = CompareData<uint8_t, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
791 break;
792 }
793 case TypeId::kNumberTypeInt32: {
794 bias = CompareData<int32_t, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
795 break;
796 }
797 case TypeId::kNumberTypeInt16: {
798 bias = CompareData<int16_t, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
799 break;
800 }
801 case TypeId::kNumberTypeBool: {
802 bias = CompareData<bool, int64_t>(name, tensor->Shape(), mutableData, relative_tolerance, absolute_tolerance);
803 break;
804 }
805 case TypeId::kNumberTypeFloat16: {
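// Widen fp16 outputs to float32 element by element so they can be compared against the calibration data as float32.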
806 size_t shapeSize = 1;
807 for (int64_t dim : tensor->Shape()) {
808 if (dim <= 0) {
809 MS_LOG(ERROR) << "The shape of output " << name << " should be greater than 0 after inference, got "
810 << tensor->Shape();
811 return RET_ERROR;
812 }
813 MS_CHECK_FALSE_MSG(SIZE_MUL_OVERFLOW(shapeSize, static_cast<size_t>(dim)), RET_ERROR, "mul overflow");
814 shapeSize *= static_cast<size_t>(dim);
815 }
816 auto *floatArr = new float[shapeSize];
817 for (size_t i = 0; i < shapeSize; ++i) {
818 uint16_t tmpInt = reinterpret_cast<uint16_t *>(mutableData)[i];
819 floatArr[i] = ShortToFloat32(tmpInt);
820 }
821 bias = CompareData<float, int64_t>(name, tensor->Shape(), floatArr);
822 delete[] floatArr;
823 break;
824 }
825 default:
826 MS_LOG(ERROR) << "Datatype " << static_cast<int>(tensor->DataType()) << " is not supported.";
827 return RET_ERROR;
828 }
829 if (bias < 0) {
830 MS_LOG(ERROR) << "CompareData failed, name: " << name;
831 return RET_ERROR;
832 }
833 *total_bias += bias;
834 *total_size += 1;
835 return RET_OK;
836 }
837 int BenchmarkUnifiedApi::CompareDataGetTotalCosineDistanceAndSize(const std::string &name, mindspore::MSTensor *tensor,
838 float *total_cosine_distance, int *total_size) {
839 if (tensor == nullptr) {
840 MS_LOG(ERROR) << "tensor is nullptr.";
841 return RET_ERROR;
842 }
843 if (total_cosine_distance == nullptr) {
844 MS_LOG(ERROR) << "total_cosine_distance is nullptr.";
845 return RET_ERROR;
846 }
847 if (total_size == nullptr) {
848 MS_LOG(ERROR) << "total_size is nullptr.";
849 return RET_ERROR;
850 }
851 float bias = 0;
852 auto mutableData = tensor->MutableData();
853 if (mutableData == nullptr) {
854 MS_LOG(ERROR) << "mutableData is nullptr.";
855 return RET_ERROR;
856 }
857 int res = RET_OK;
858 switch (static_cast<int>(tensor->DataType())) {
859 case TypeId::kNumberTypeFloat:
860 case TypeId::kNumberTypeFloat32: {
861 res = CompareDatabyCosineDistance<float>(name, tensor->Shape(), mutableData, &bias);
862 break;
863 }
864 case TypeId::kNumberTypeFloat16: {
865 size_t shapeSize = 1;
866 for (int64_t dim : tensor->Shape()) {
867 if (dim <= 0) {
868 MS_LOG(ERROR) << "Invalid shape.";
869 return RET_ERROR;
870 }
871 MS_CHECK_FALSE_MSG(SIZE_MUL_OVERFLOW(shapeSize, static_cast<size_t>(dim)), RET_ERROR, "mul overflow");
872 shapeSize *= static_cast<size_t>(dim);
873 }
874 float *floatArr = new float[shapeSize];
875 for (size_t i = 0; i < shapeSize; ++i) {
876 uint16_t tmpInt = reinterpret_cast<uint16_t *>(mutableData)[i];
877 Convert2Float32(&floatArr[i], tmpInt);
878 }
879 // Compare the widened copy directly; writing float32 back into the fp16 buffer would overrun it.
880 bias = CompareData<float, int64_t>(name, tensor->Shape(), floatArr);
881 delete[] floatArr;
882 break;
883 }
884 case TypeId::kNumberTypeInt8: {
885 res = CompareDatabyCosineDistance<int8_t>(name, tensor->Shape(), mutableData, &bias);
886 break;
887 }
888 case TypeId::kNumberTypeUInt8: {
889 res = CompareDatabyCosineDistance<uint8_t>(name, tensor->Shape(), mutableData, &bias);
890 break;
891 }
892 case TypeId::kNumberTypeInt32: {
893 res = CompareDatabyCosineDistance<int32_t>(name, tensor->Shape(), mutableData, &bias);
894 break;
895 }
896 case TypeId::kNumberTypeInt16: {
897 res = CompareDatabyCosineDistance<int16_t>(name, tensor->Shape(), mutableData, &bias);
898 break;
899 }
900 case TypeId::kNumberTypeBool: {
901 res = CompareDatabyCosineDistance<bool>(name, tensor->Shape(), mutableData, &bias);
902 break;
903 }
904 default:
905 MS_LOG(ERROR) << "Datatype " << static_cast<int>(tensor->DataType()) << " is not supported.";
906 return RET_ERROR;
907 }
908 if (res != RET_OK) {
909 MS_LOG(ERROR) << "CompareData failed, name: " << name;
910 return RET_ERROR;
911 }
912 *total_cosine_distance += 1 - bias;
913 *total_size += 1;
914 return RET_OK;
915 }
916
917 int BenchmarkUnifiedApi::MarkPerformance() {
918 MS_LOG(INFO) << "Running warm up loops...";
919 std::cout << "Running warm up loops..." << std::endl;
920 std::vector<MSTensor> outputs;
921 for (int i = 0; i < flags_->warm_up_loop_count_; i++) {
922 auto status = ms_model_.Predict(ms_inputs_for_api_, &outputs);
923 if (status != kSuccess) {
924 MS_LOG(ERROR) << "Inference error ";
925 std::cerr << "Inference error " << std::endl;
926 return RET_ERROR;
927 }
928 }
929
930 MS_LOG(INFO) << "Running benchmark loops...";
931 std::cout << "Running benchmark loops..." << std::endl;
932 uint64_t time_min = UINT64_MAX;
933 uint64_t time_max = 0;
934 uint64_t time_avg = 0;
935
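// Time each Predict call in microseconds and track min/max/total across loop_count_ iterations.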
936 for (int i = 0; i < flags_->loop_count_; i++) {
937 auto inputs = ms_model_.GetInputs();
938 for (auto tensor : inputs) {
939 tensor.MutableData(); // prepare data
940 }
941 auto start = GetTimeUs();
942 auto status = ms_model_.Predict(ms_inputs_for_api_, &outputs, ms_before_call_back_, ms_after_call_back_);
943 if (status != kSuccess) {
944 MS_LOG(ERROR) << "Inference error ";
945 std::cerr << "Inference error ";
946 return RET_ERROR;
947 }
948
949 auto end = GetTimeUs();
950 auto time = end - start;
951 time_min = std::min(time_min, time);
952 time_max = std::max(time_max, time);
953 time_avg += time;
954 }
955
956 if (flags_->time_profiling_) {
957 const std::vector<std::string> per_op_name = {"opName", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
958 const std::vector<std::string> per_op_type = {"opType", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
959 (void)PrintResult(per_op_name, op_times_by_name_);
960 (void)PrintResult(per_op_type, op_times_by_type_);
961 #ifdef ENABLE_ARM64
962 } else if (flags_->perf_profiling_) {
963 if (flags_->perf_event_ == "CACHE") {
964 const std::vector<std::string> per_op_name = {"opName", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
965 const std::vector<std::string> per_op_type = {"opType", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
966 (void)PrintPerfResult(per_op_name, op_perf_by_name_);
967 (void)PrintPerfResult(per_op_type, op_perf_by_type_);
968 } else if (flags_->perf_event_ == "STALL") {
969 const std::vector<std::string> per_op_name = {"opName", "frontend(k)", "frontend(%)", "backend(k)",
970 "backend(%)"};
971 const std::vector<std::string> per_op_type = {"opType", "frontend(k)", "frontend(%)", "backend(k)",
972 "backend(%)"};
973 (void)PrintPerfResult(per_op_name, op_perf_by_name_);
974 (void)PrintPerfResult(per_op_type, op_perf_by_type_);
975 } else {
976 const std::vector<std::string> per_op_name = {"opName", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
977 const std::vector<std::string> per_op_type = {"opType", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
978 (void)PrintPerfResult(per_op_name, op_perf_by_name_);
979 (void)PrintPerfResult(per_op_type, op_perf_by_type_);
980 }
981 #endif
982 }
983
984 if (flags_->loop_count_ > 0) {
985 time_avg /= static_cast<size_t>(flags_->loop_count_);
986 MS_LOG(INFO) << "Model = " << flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
987 << ", NumThreads = " << flags_->num_threads_ << ", MinRunTime = " << time_min / kFloatMSEC
988 << ", MaxRuntime = " << time_max / kFloatMSEC << ", AvgRunTime = " << time_avg / kFloatMSEC;
989 printf("Model = %s, NumThreads = %d, MinRunTime = %f ms, MaxRuntime = %f ms, AvgRunTime = %f ms\n",
990 flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1).c_str(), flags_->num_threads_,
991 time_min / kFloatMSEC, time_max / kFloatMSEC, time_avg / kFloatMSEC);
992 }
993 return RET_OK;
994 }
995
996 int BenchmarkUnifiedApi::MarkAccuracy() {
997 MS_LOG(INFO) << "MarkAccuracy";
998 std::cout << "MarkAccuracy" << std::endl;
999
1000 int status = 0;
1001 if (flags_->enable_gl_texture_) {
1002 for (auto in_tensor : ms_inputs_for_api_) {
1003 auto *input = reinterpret_cast<GLuint *>(in_tensor.MutableData());
1004 if (input == nullptr) {
1005 MS_LOG(ERROR) << "get input data failed";
1006 return RET_ERROR;
1007 }
1008 float *hostptr = reinterpret_cast<float *>(gl_runtime_.CopyDeviceTextureToHost(*input));
1009 size_t print_num = 20;
1010 gl_runtime_.PrintImage2DData(hostptr, 1, 1, print_num);
1011 }
1012 } else {
1013 status = PrintInputData();
1014 if (status != RET_OK) {
1015 MS_LOG(ERROR) << "PrintInputData error " << status;
1016 std::cerr << "PrintInputData error " << status << std::endl;
1017 return status;
1018 }
1019 }
1020 std::vector<MSTensor> outputs;
1021 auto ret = ms_model_.Predict(ms_inputs_for_api_, &outputs, ms_before_call_back_, ms_after_call_back_);
1022 if (ret != kSuccess) {
1023 MS_LOG(ERROR) << "Inference error ";
1024 std::cerr << "Inference error " << std::endl;
1025 return RET_ERROR;
1026 }
1027 status = ReadCalibData();
1028 if (status != RET_OK) {
1029 MS_LOG(ERROR) << "Read calib data error " << status;
1030 std::cerr << "Read calib data error " << status << std::endl;
1031 return status;
1032 }
1033 status = CompareOutput();
1034 if (status != RET_OK) {
1035 MS_LOG(ERROR) << "Compare output error " << status;
1036 std::cerr << "Compare output error " << status << std::endl;
1037 return status;
1038 }
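// A cosine_distance_threshold_ below -1 disables the additional cosine-distance check.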
1039 if (this->flags_->cosine_distance_threshold_ >= -1) {
1040 status = CompareOutputByCosineDistance(this->flags_->cosine_distance_threshold_);
1041 if (status != RET_OK) {
1042 MS_LOG(ERROR) << "Compare output error by cosine distance " << status;
1043 std::cerr << "Compare output error by cosine distance " << status << std::endl;
1044 return status;
1045 }
1046 }
1047 return RET_OK;
1048 }
1049
1050 int BenchmarkUnifiedApi::PrintInputData() {
1051 for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
1052 mindspore::MSTensor input = ms_inputs_for_api_[i];
1053 auto tensor_data_type = static_cast<int>(input.DataType());
1054
1055 std::cout << "InData " << i << ": ";
1056 if (tensor_data_type == TypeId::kNumberTypeFloat16) {
1057 MS_LOG(INFO) << "DataType: " << TypeId::kNumberTypeFloat16;
1058 continue;
1059 }
1060 if (tensor_data_type == TypeId::kObjectTypeString) {
1061 std::vector<std::string> output_strings = MSTensor::TensorToStrings(input);
1062 size_t print_num = std::min(output_strings.size(), static_cast<size_t>(20));
1063 for (size_t j = 0; j < print_num; j++) {
1064 std::cout << output_strings[j] << std::endl;
1065 }
1066 continue;
1067 }
1068 size_t print_num = std::min(static_cast<int>(input.ElementNum()), kPrintDataNum);
1069 const void *in_data = input.MutableData();
1070 if (in_data == nullptr) {
1071 MS_LOG(ERROR) << "in_data is nullptr.";
1072 return RET_ERROR;
1073 }
1074
1075 for (size_t j = 0; j < print_num; j++) {
1076 if (tensor_data_type == TypeId::kNumberTypeFloat32 || tensor_data_type == TypeId::kNumberTypeFloat) {
1077 std::cout << static_cast<const float *>(in_data)[j] << " ";
1078 } else if (tensor_data_type == TypeId::kNumberTypeInt8) {
1079 std::cout << static_cast<const int8_t *>(in_data)[j] << " ";
1080 } else if (tensor_data_type == TypeId::kNumberTypeUInt8) {
1081 std::cout << static_cast<const uint8_t *>(in_data)[j] << " ";
1082 } else if (tensor_data_type == TypeId::kNumberTypeInt32) {
1083 std::cout << static_cast<const int32_t *>(in_data)[j] << " ";
1084 } else if (tensor_data_type == TypeId::kNumberTypeInt64) {
1085 std::cout << static_cast<const int64_t *>(in_data)[j] << " ";
1086 } else if (tensor_data_type == TypeId::kNumberTypeBool) {
1087 std::cout << static_cast<const bool *>(in_data)[j] << " ";
1088 } else {
1089 MS_LOG(ERROR) << "Datatype: " << tensor_data_type << " is not supported.";
1090 return RET_ERROR;
1091 }
1092 }
1093 std::cout << std::endl;
1094 }
1095 return RET_OK;
1096 }
1097 #ifdef PARALLEL_INFERENCE
1098 void BenchmarkUnifiedApi::ModelParallelRunnerWarmUp(int index) {
1099 auto in = model_runner_.GetInputs();
1100 for (size_t i = 0; i < in.size(); i++) {
1101 in[i].SetShape(resize_dims_[i]);
1102 in[i].SetData(all_inputs_data_[index][i], false);
1103 }
1104 auto warm_up_start = GetTimeUs();
1105 std::vector<MSTensor> output;
1106 auto ret = model_runner_.Predict(in, &output);
1107 for (size_t j = 0; j < in.size(); j++) {
1108 in[j].SetData(nullptr);
1109 }
1110 if (ret != kSuccess) {
1111 model_parallel_runner_ret_failed_ = true;
1112 MS_LOG(ERROR) << "model pool predict failed.";
1113 return;
1114 }
1115 auto warm_up_end = GetTimeUs();
1116 std::cout << "warm up index: " << index << " | time: " << (warm_up_end - warm_up_start) / kFloatMSEC << " ms\n";
1117 }
1118
1119 void BenchmarkUnifiedApi::ModelParallelRunnerRun(int task_num, int parallel_idx) {
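// task_num == -1 keeps this worker issuing requests indefinitely; otherwise it runs task_num predictions.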
1120 for (int i = 0; i < task_num || task_num == -1; i++) {
1121 while (!runner_run_start_) {
1122 continue;
1123 }
1124 int idx = parallel_idx + flags_->warm_up_loop_count_;
1125 auto in = model_runner_.GetInputs();
1126 if (idx >= static_cast<int>(all_inputs_data_.size())) {
1127 MS_LOG(ERROR) << "idx is too big: " << idx;
1128 return;
1129 }
1130 auto in_data = all_inputs_data_[idx];
1131 for (size_t tensor_index = 0; tensor_index < in.size(); tensor_index++) {
1132 in.at(tensor_index).SetShape(resize_dims_.at(tensor_index));
1133 in.at(tensor_index).SetData(all_inputs_data_.at(idx)[tensor_index], false);
1134 }
1135 auto predict_start = GetTimeUs();
1136 std::vector<MSTensor> output;
1137 auto ret = model_runner_.Predict(in, &output);
1138 if (ret != kSuccess) {
1139 model_parallel_runner_ret_failed_ = true;
1140 MS_LOG(ERROR) << "model pool predict failed.";
1141 for (auto &item : in) {
1142 item.SetData(nullptr);
1143 }
1144 return;
1145 }
1146 auto predict_end = GetTimeUs();
1147 std::cout << "parallel index: " << parallel_idx << " | task index: " << i
1148 << " | predict time: " << (predict_end - predict_start) / kFloatMSEC << " ms\n";
1149 for (size_t j = 0; j < in.size(); j++) {
1150 in[j].SetData(nullptr);
1151 }
1152 if (!flags_->benchmark_data_file_.empty()) {
1153 auto status = CompareOutputForModelPool(&output);
1154 if (status != RET_OK) {
1155 model_parallel_runner_ret_failed_ = true;
1156 MS_LOG(ERROR) << "Compare output error " << status;
1157 return;
1158 }
1159 }
1160 }
1161 }
1162
1163 int BenchmarkUnifiedApi::AddConfigInfo(const std::shared_ptr<RunnerConfig> &runner_config) {
1164 if (!flags_->config_file_.empty()) {
1165 runner_config->SetConfigPath(flags_->config_file_);
1166 }
1167 std::map<std::string, std::string> config;
1168 if (flags_->enable_shared_thread_pool_) {
1169 config[kEnableSharedThreadPoolKey] = "true";
1170 if (!flags_->thread_num_limit_per_worker_.empty()) {
1171 config[kThreadNumLimitPerWorkerKey] = flags_->thread_num_limit_per_worker_;
1172 }
1173 if (!flags_->thread_num_remaining_per_worker_.empty()) {
1174 config[kThreadNumRemainingPerWorkerKey] = flags_->thread_num_remaining_per_worker_;
1175 }
1176 } else {
1177 config[kEnableSharedThreadPoolKey] = "false";
1178 }
1179 runner_config->SetConfigInfo(kSharedThreadPoolSection, config);
1180 return RET_OK;
1181 }
1182
1183 int BenchmarkUnifiedApi::ParallelInference(std::shared_ptr<mindspore::Context> context) {
1184 if (flags_->warm_up_loop_count_ > kMaxRequestNum || flags_->parallel_num_ > kMaxRequestNum) {
1185 MS_LOG(WARNING) << "in parallel predict, warm up loop count and parallel num should be less than " << kMaxRequestNum;
1186 }
1187
1188 // model runner init
1189 auto runner_config = std::make_shared<RunnerConfig>();
1190 runner_config->SetContext(context);
1191 runner_config->SetWorkersNum(flags_->workers_num_);
1192 auto status = AddConfigInfo(runner_config);
1193 MS_CHECK_FALSE_MSG(status != kSuccess, RET_ERROR, "add config info for parallel predict failed.");
1194 auto model_init_start = GetTimeUs();
1195 auto ret = model_runner_.Init(flags_->model_file_, runner_config);
1196 MS_CHECK_FALSE_MSG(ret != kSuccess, RET_ERROR, "model pool init failed.");
1197 auto model_init_end = GetTimeUs();
1198
1199 // load data
1200 ms_inputs_for_api_ = model_runner_.GetInputs();
1201 MS_CHECK_FALSE_MSG(ms_inputs_for_api_.empty(), RET_ERROR, "model pool input is empty.");
1202 ms_outputs_for_api_ = model_runner_.GetOutputs();
1203 MS_CHECK_FALSE_MSG(ms_outputs_for_api_.empty(), RET_ERROR, "model pool output is empty.");
1204
1205 if (!flags_->graph_input_shape_map_.empty()) {
1206 // parse model input shapes from --inputShape flag
1207 std::vector<std::vector<int64_t>> resize_dims = ParseGraphInputShapeMap(model_runner_.GetInputs());
1208 MS_CHECK_FALSE_MSG(resize_dims.empty(), RET_ERROR, "resize dims empty.");
1209 (void)std::transform(resize_dims.begin(), resize_dims.end(), std::back_inserter(resize_dims_),
1210 [&](const auto &shapes) { return shapes; });
1211 } else {
1212 (void)std::transform(flags_->resize_dims_.begin(), flags_->resize_dims_.end(), std::back_inserter(resize_dims_),
1213 [&](auto &shapes) { return this->ConverterToInt64Vector<int>(shapes); });
1214 }
1215
1216 for (int i = 0; i < flags_->parallel_num_ + flags_->warm_up_loop_count_; i++) {
1217 status = LoadInput();
1218 MS_CHECK_FALSE_MSG(status != RET_OK, status, "Generate input data error");
1219 std::vector<MSTensor> output;
1220 all_outputs_.push_back(output);
1221 }
1222 if (!flags_->benchmark_data_file_.empty()) {
1223 for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
1224 auto &tensor = ms_inputs_for_api_[i];
1225 tensor.SetShape(resize_dims_[i]);
1226 tensor.SetData(all_inputs_data_[0][i], false);
1227 }
1228 status = PrintInputData();
1229 MS_CHECK_FALSE_MSG(status != RET_OK, status, "PrintInputData error ");
1230 status = ReadCalibData();
1231 MS_CHECK_FALSE_MSG(status != RET_OK, status, "ReadCalibData error ");
1232 }
1233
1234 // warm up
1235 std::vector<std::thread> model_thread_warm_up;
1236 for (int i = 0; i < flags_->warm_up_loop_count_; i++) {
1237 model_thread_warm_up.push_back(std::thread(&BenchmarkUnifiedApi::ModelParallelRunnerWarmUp, this, i));
1238 }
1239 for (auto &warm_up_thread : model_thread_warm_up) {
1240 warm_up_thread.join();
1241 }
1242 if (model_parallel_runner_ret_failed_) {
1243 return RET_ERROR;
1244 }
1245 std::cout << "=============== end warm up ===============\n";
1246 // do loop count
1247 std::vector<std::thread> model_thread_run;
1248 for (int parallel_num_idx = 0; parallel_num_idx < flags_->parallel_num_; parallel_num_idx++) {
1249 model_thread_run.push_back(
1250 std::thread(&BenchmarkUnifiedApi::ModelParallelRunnerRun, this, flags_->parallel_task_num_, parallel_num_idx));
1251 }
1252 auto start_run_time = lite::GetTimeUs();
1253 runner_run_start_ = true;
1254 for (auto &run_thread : model_thread_run) {
1255 run_thread.join();
1256 }
1257 auto end_run_time = lite::GetTimeUs();
1258 if (model_parallel_runner_ret_failed_) {
1259 return RET_ERROR;
1260 }
1261 std::cout << "=================================" << std::endl;
1262 std::cout << "parallel predict init time: " << (model_init_end - model_init_start) / kFloatMSEC << " ms\n";
1263 std::cout << "parallel predict all run time: " << (end_run_time - start_run_time) / kFloatMSEC << " ms\n";
1264 std::cout << "=================================" << std::endl;
1265 return RET_OK;
1266 }
1267 #endif
1268
1269 int BenchmarkUnifiedApi::PrintOutputData() {
1270 for (size_t i = 0; i < ms_outputs_for_api_.size(); i++) {
1271 mindspore::MSTensor input = ms_outputs_for_api_[i];
1272 auto tensor_data_type = static_cast<int>(input.DataType());
1273
1274 std::cout << "OutData " << i << ": ";
1275 if (tensor_data_type == TypeId::kNumberTypeFloat16) {
1276 MS_LOG(INFO) << "DataType: " << TypeId::kNumberTypeFloat16;
1277 continue;
1278 }
1279 if (tensor_data_type == TypeId::kObjectTypeString) {
1280 std::vector<std::string> output_strings = MSTensor::TensorToStrings(input);
1281 size_t print_num = std::min(output_strings.size(), static_cast<size_t>(20));
1282 for (size_t j = 0; j < print_num; j++) {
1283 std::cout << output_strings[j] << std::endl;
1284 }
1285 continue;
1286 }
1287 size_t print_num = std::min(static_cast<int>(input.ElementNum()), kPrintDataNum);
1288 const void *in_data = input.MutableData();
1289 if (in_data == nullptr) {
1290 MS_LOG(ERROR) << "out_data is nullptr.";
1291 return RET_ERROR;
1292 }
1293
1294 for (size_t j = 0; j < print_num; j++) {
1295 if (tensor_data_type == TypeId::kNumberTypeFloat32 || tensor_data_type == TypeId::kNumberTypeFloat) {
1296 std::cout << static_cast<const float *>(in_data)[j] << " ";
1297 } else if (tensor_data_type == TypeId::kNumberTypeInt8) {
1298 std::cout << static_cast<const int8_t *>(in_data)[j] << " ";
1299 } else if (tensor_data_type == TypeId::kNumberTypeUInt8) {
1300 std::cout << static_cast<const uint8_t *>(in_data)[j] << " ";
1301 } else if (tensor_data_type == TypeId::kNumberTypeInt32) {
1302 std::cout << static_cast<const int32_t *>(in_data)[j] << " ";
1303 } else if (tensor_data_type == TypeId::kNumberTypeInt64) {
1304 std::cout << static_cast<const int64_t *>(in_data)[j] << " ";
1305 } else if (tensor_data_type == TypeId::kNumberTypeBool) {
1306 std::cout << static_cast<const bool *>(in_data)[j] << " ";
1307 } else {
1308 MS_LOG(ERROR) << "Datatype: " << tensor_data_type << " is not supported.";
1309 return RET_ERROR;
1310 }
1311 }
1312 std::cout << std::endl;
1313 }
1314 return RET_OK;
1315 }
1316
1317 int BenchmarkUnifiedApi::CompileGraph(mindspore::ModelType model_type, const std::shared_ptr<Context> &context,
1318 const std::string &model_name) {
1319 Key dec_key;
1320 if (!flags_->decrypt_key_str_.empty()) {
1321 dec_key.len = lite::Hex2ByteArray(flags_->decrypt_key_str_, dec_key.key, kEncMaxLen);
1322 if (dec_key.len == 0) {
1323 MS_LOG(ERROR) << "dec_key.len == 0";
1324 return RET_INPUT_PARAM_INVALID;
1325 }
1326 flags_->decrypt_key_str_.clear();
1327 }
1328 Status ret;
1329 if (flags_->crypto_lib_path_.empty()) {
1330 ret = ms_model_.Build(flags_->model_file_, model_type, context);
1331 } else {
1332 ret =
1333 ms_model_.Build(flags_->model_file_, model_type, context, dec_key, flags_->dec_mode_, flags_->crypto_lib_path_);
1334 }
1335 memset(dec_key.key, 0, kEncMaxLen);
1336 if (ret != kSuccess) {
1337 MS_LOG(ERROR) << "ms_model_.Build failed while running " << model_name.c_str();
1338 std::cout << "ms_model_.Build failed while running " << model_name.c_str() << std::endl;
1339 return RET_ERROR;
1340 }
1341 return RET_OK;
1342 }
1343
1344 std::vector<std::vector<int64_t>> BenchmarkUnifiedApi::ParseGraphInputShapeMap(const std::vector<MSTensor> &inputs) {
1345 std::vector<std::vector<int64_t>> resize_dims;
1346 if (flags_->graph_input_shape_map_.size() != inputs.size()) {
1347 MS_LOG(ERROR) << "The number of inputs in the model does not match the parsed inputShape option. The model has ["
1348 << inputs.size() << "] input(s), while the parsed inputShape has ["
1349 << flags_->graph_input_shape_map_.size() << "] input(s).";
1350 return resize_dims;
1351 }
1352 for (auto &model_input : inputs) {
1353 if (flags_->graph_input_shape_map_.find(model_input.Name()) == flags_->graph_input_shape_map_.end()) {
1354 MS_LOG(ERROR) << "model input [" << model_input.Name()
1355 << "] is not found in inputShape option, please double check";
1356 MS_LOG(ERROR) << "model input names are as follows:";
1357 for (auto &mod_input : inputs) {
1358 MS_LOG(ERROR) << mod_input.Name();
1359 }
1360 MS_LOG(ERROR) << "user input names are as follows:";
1361 for (auto &user_input : flags_->graph_input_shape_map_) {
1362 MS_LOG(ERROR) << user_input.first;
1363 }
1364 return resize_dims;
1365 } else {
1366 auto shapes = flags_->graph_input_shape_map_[model_input.Name()];
1367 resize_dims.push_back(this->ConverterToInt64Vector(shapes));
1368 }
1369 }
1370 return resize_dims;
1371 }
1372
1373 #ifdef PARALLEL_INFERENCE
1374 int BenchmarkUnifiedApi::RunParallelBenchmark(std::shared_ptr<mindspore::Context> context) {
1375 if (flags_->resize_dims_.empty() && flags_->graph_input_shape_map_.empty()) {
1376 MS_LOG(ERROR) << "model input shapes should be provided when using parallel predict, please specify --inputShape";
1377 return RET_ERROR;
1378 }
1379 auto status = ParallelInference(context);
1380 MS_CHECK_FALSE_MSG(status != RET_OK, RET_ERROR, "run model pool failed.");
1381 return RET_OK;
1382 }
1383 #endif
1384
1385 int BenchmarkUnifiedApi::RunBenchmark() {
1386 auto start_prepare_time = GetTimeUs();
1387
1388 if (flags_->enable_gl_texture_) {
1389 if (!gl_runtime_.Init()) {
1390 MS_LOG(ERROR) << "opengl runtime init failed ";
1391 std::cerr << "opengl runtime init failed ";
1392 return RET_ERROR;
1393 }
1394 }
1395
1396 // Load graph
1397 std::string model_name = flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1);
1398 auto iter = ModelTypeMap.find(flags_->model_type_);
1399 if (iter == ModelTypeMap.end()) {
1400 MS_LOG(ERROR) << "model_type " << flags_->model_type_ << " is invalid.";
1401 std::cerr << "model_type " << flags_->model_type_ << " is invalid.";
1402 return RET_ERROR;
1403 }
1404 mindspore::ModelType model_type = iter->second;
1405
1406 MS_LOG(INFO) << "start unified benchmark run";
1407 std::cout << "start unified benchmark run" << std::endl;
1408
1409 auto context = std::make_shared<mindspore::Context>();
1410 if (context == nullptr) {
1411 MS_LOG(ERROR) << "New context failed while running " << model_name.c_str();
1412 std::cerr << "New context failed while running " << model_name.c_str() << std::endl;
1413 return RET_ERROR;
1414 }
1415
1416 auto status = InitMSContext(context);
1417 if (status != RET_OK) {
1418 MS_LOG(ERROR) << "InitMSContext failed while running " << model_name.c_str();
1419 std::cout << "InitMSContext failed while running " << model_name.c_str();
1420 return RET_ERROR;
1421 }
1422
1423 (void)UpdateDistributionName(context, &flags_->model_file_);
1424 (void)UpdateDistributionName(context, &flags_->benchmark_data_file_);
1425 (void)UpdateDistributionName(context, &flags_->config_file_);
1426
1427 if (!flags_->config_file_.empty()) {
1428 auto config_ret = ms_model_.LoadConfig(flags_->config_file_);
1429 if (config_ret != kSuccess) {
1430 MS_LOG(ERROR) << "ms_model_.LoadConfig failed while running " << model_name.c_str();
1431 std::cout << "ms_model_.LoadConfig failed while running " << model_name.c_str() << std::endl;
1432 }
1433 }
1434
1435 UpdateConfigInfo();
1436 #ifdef PARALLEL_INFERENCE
1437 if (flags_->enable_parallel_predict_) {
1438 MS_CHECK_FALSE_MSG(RunParallelBenchmark(context) != RET_OK, RET_ERROR, "run model pool failed.");
1439 return RET_OK;
1440 }
1441 #endif
1442
1443 status = CompileGraph(model_type, context, model_name);
1444 MS_CHECK_FALSE_MSG(status != RET_OK, status, "Compile graph failed.");
1445 if (!flags_->graph_input_shape_map_.empty()) {
1446 std::vector<std::vector<int64_t>> resize_dims = ParseGraphInputShapeMap(ms_model_.GetInputs());
1447 MS_CHECK_FALSE_MSG(resize_dims.empty(), RET_ERROR, "resize_dims is empty");
1448 auto ret = ms_model_.Resize(ms_model_.GetInputs(), resize_dims);
1449 if (ret != kSuccess) {
1450 MS_LOG(ERROR) << "Input tensor resize failed.";
1451 std::cout << "Input tensor resize failed." << std::endl;
1452 return RET_ERROR;
1453 }
1454 } else if (!flags_->resize_dims_.empty()) {
1455 std::vector<std::vector<int64_t>> resize_dims;
1456 (void)std::transform(flags_->resize_dims_.begin(), flags_->resize_dims_.end(), std::back_inserter(resize_dims),
1457 [&](auto &shapes) { return this->ConverterToInt64Vector<int>(shapes); });
1458
1459 auto ret = ms_model_.Resize(ms_model_.GetInputs(), resize_dims);
1460 if (ret != kSuccess) {
1461 MS_LOG(ERROR) << "Input tensor resize failed.";
1462 std::cout << "Input tensor resize failed." << std::endl;
1463 return RET_ERROR;
1464 }
1465 }
1466
1467 ms_inputs_for_api_ = ms_model_.GetInputs();
1468 ms_outputs_for_api_ = ms_model_.GetOutputs();
1469 auto end_prepare_time = GetTimeUs();
1470 MS_LOG(INFO) << "PrepareTime = " << ((end_prepare_time - start_prepare_time) / kFloatMSEC) << " ms";
1471 std::cout << "PrepareTime = " << ((end_prepare_time - start_prepare_time) / kFloatMSEC) << " ms" << std::endl;
1472
1473 // Load input
1474 MS_LOG(INFO) << "start generate input data";
1475 status = LoadInput();
1476 if (status != RET_OK) {
1477 MS_LOG(ERROR) << "Generate input data error";
1478 return status;
1479 }
1480 return GetBenchmarkResult();
1481 }
1482
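// Runs accuracy comparison (MarkAccuracy) when a benchmark data file was supplied, otherwise
// measures performance (MarkPerformance); finally reports the dump directory if tensor
// dumping is enabled and finalizes the model.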
GetBenchmarkResult()1483 int BenchmarkUnifiedApi::GetBenchmarkResult() {
1484 if (!flags_->benchmark_data_file_.empty()) {
1485 auto status = MarkAccuracy();
1486 if (status != RET_OK) {
1487 MS_LOG(ERROR) << "Run MarkAccuracy error: " << status;
1488 std::cout << "Run MarkAccuracy error: " << status << std::endl;
1489 return status;
1490 }
1491 } else {
1492 auto status = MarkPerformance();
1493 if (status != RET_OK) {
1494 MS_LOG(ERROR) << "Run MarkPerformance error: " << status;
1495 std::cout << "Run MarkPerformance error: " << status << std::endl;
1496 return status;
1497 }
1498 }
1499 if (flags_->dump_tensor_data_) {
1500 std::cout << "Dumped data is saved to: " << dump_file_output_dir_ << std::endl;
1501 }
1502 Status finalize_ret = ms_model_.Finalize();
1503 if (finalize_ret == kSuccess) {
1504 MS_LOG(INFO) << "Benchmark finalize executed success.";
1505 }
1506 return RET_OK;
1507 }
1508
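// Installs before/after node callbacks (used by the benchmark's Predict calls) that accumulate
// per-op and per-op-type execution times. With inter_op_parallel_num_ > 1 the counters are
// guarded by op_times_mutex_ and start times are tracked per node name, since nodes may run
// concurrently; otherwise a single op_begin_ timestamp is sufficient.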
InitTimeProfilingCallbackParameter()1509 int BenchmarkUnifiedApi::InitTimeProfilingCallbackParameter() {
1510 if (flags_->inter_op_parallel_num_ > 1) {
1511 // before callback
1512 ms_before_call_back_ = [&, this](const std::vector<mindspore::MSTensor> &before_inputs,
1513 const std::vector<mindspore::MSTensor> &before_outputs,
1514 const MSCallBackParam &call_param) {
1515 if (before_inputs.empty()) {
1516 MS_LOG(INFO) << "before_inputs is empty";
1517 }
1518 if (before_outputs.empty()) {
1519 MS_LOG(INFO) << "before_outputs is empty";
1520 }
1521 {
1522 std::lock_guard<std::mutex> _l(op_times_mutex_);
1523 if (op_times_by_type_.find(call_param.node_type) == op_times_by_type_.end()) {
1524 op_times_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, 0.0f)));
1525 }
1526 if (op_times_by_name_.find(call_param.node_name) == op_times_by_name_.end()) {
1527 op_times_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, 0.0f)));
1528 }
1529 op_start_times_by_name_[call_param.node_name] = GetTimeUs();
1530 op_call_times_total_++;
1531 }
1532 return true;
1533 };
1534
1535 // after callback
1536 ms_after_call_back_ = [&, this](const std::vector<mindspore::MSTensor> &after_inputs,
1537 const std::vector<mindspore::MSTensor> &after_outputs,
1538 const MSCallBackParam &call_param) {
1539 uint64_t opEnd = GetTimeUs();
1540
1541 if (after_inputs.empty()) {
1542 MS_LOG(INFO) << "after_inputs is empty";
1543 }
1544 if (after_outputs.empty()) {
1545 MS_LOG(INFO) << "after_outputs is empty";
1546 }
1547 {
1548 std::lock_guard<std::mutex> _l(op_times_mutex_);
1549 float cost = static_cast<float>(opEnd - op_start_times_by_name_[call_param.node_name]) / kFloatMSEC;
1550 if (flags_->device_ == "GPU") {
1551 cost = static_cast<float>(call_param.execute_time);
1552 }
1553 op_cost_total_ += cost;
1554 op_times_by_type_[call_param.node_type].first++;
1555 op_times_by_type_[call_param.node_type].second += cost;
1556 op_times_by_name_[call_param.node_name].first++;
1557 op_times_by_name_[call_param.node_name].second += cost;
1558 }
1559 return true;
1560 };
1561 } else {
1562 // before callback
1563 ms_before_call_back_ = [&, this](const std::vector<mindspore::MSTensor> &before_inputs,
1564 const std::vector<mindspore::MSTensor> &before_outputs,
1565 const MSCallBackParam &call_param) {
1566 if (before_inputs.empty()) {
1567 MS_LOG(INFO) << "before_inputs is empty";
1568 }
1569 if (before_outputs.empty()) {
1570 MS_LOG(INFO) << "before_outputs is empty";
1571 }
1572 if (op_times_by_type_.find(call_param.node_type) == op_times_by_type_.end()) {
1573 op_times_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, 0.0f)));
1574 }
1575 if (op_times_by_name_.find(call_param.node_name) == op_times_by_name_.end()) {
1576 op_times_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, 0.0f)));
1577 }
1578
1579 op_call_times_total_++;
1580 op_begin_ = GetTimeUs();
1581 return true;
1582 };
1583
1584 // after callback
1585 ms_after_call_back_ = [&, this](const std::vector<mindspore::MSTensor> &after_inputs,
1586 const std::vector<mindspore::MSTensor> &after_outputs,
1587 const MSCallBackParam &call_param) {
1588 uint64_t opEnd = GetTimeUs();
1589
1590 if (after_inputs.empty()) {
1591 MS_LOG(INFO) << "after_inputs is empty";
1592 }
1593 if (after_outputs.empty()) {
1594 MS_LOG(INFO) << "after_outputs is empty";
1595 }
1596
1597 float cost = static_cast<float>(opEnd - op_begin_) / kFloatMSEC;
1598 if (flags_->device_ == "GPU") {
1599 cost = static_cast<float>(call_param.execute_time);
1600 }
1601 op_cost_total_ += cost;
1602 op_times_by_type_[call_param.node_type].first++;
1603 op_times_by_type_[call_param.node_type].second += cost;
1604 op_times_by_name_[call_param.node_name].first++;
1605 op_times_by_name_[call_param.node_name].second += cost;
1606 return true;
1607 };
1608 }
1609 return RET_OK;
1610 }
1611
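// ARM64-only: opens two perf_event counters in one group (CPU cycles/instructions by default,
// cache references/misses when perf_event_ is "CACHE", frontend/backend stall cycles for
// "STALL") and installs callbacks that reset and enable the group before each node and read
// both counts after it. With PERF_FORMAT_GROUP | PERF_FORMAT_ID a read() returns, roughly,
//   struct { uint64_t nr; struct { uint64_t value; uint64_t id; } values[nr]; }
// which the PerfResult struct used below is assumed to mirror.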
InitPerfProfilingCallbackParameter()1612 int BenchmarkUnifiedApi::InitPerfProfilingCallbackParameter() {
1613 #ifndef ENABLE_ARM64
1614 MS_LOG(ERROR) << "Only support perf_profiling on arm64.";
1615 return RET_ERROR;
1616 #else
1617 struct perf_event_attr pe, pe2;
1618 memset(&pe, 0, sizeof(struct perf_event_attr));
1619 memset(&pe2, 0, sizeof(struct perf_event_attr));
1620 pe.type = PERF_TYPE_HARDWARE;
1621 pe2.type = PERF_TYPE_HARDWARE;
1622 pe.size = sizeof(struct perf_event_attr);
1623 pe2.size = sizeof(struct perf_event_attr);
1624 pe.disabled = 1;
1625 pe2.disabled = 1;
1626 pe.exclude_kernel = 1; // don't count kernel
1627 pe2.exclude_kernel = 1; // don't count kernel
1628 pe.exclude_hv = 1; // don't count hypervisor
1629 pe2.exclude_hv = 1; // don't count hypervisor
1630 pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
1631 pe2.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
1632 if (flags_->perf_event_ == "CACHE") {
1633 pe.config = PERF_COUNT_HW_CACHE_REFERENCES;
1634 pe2.config = PERF_COUNT_HW_CACHE_MISSES;
1635 } else if (flags_->perf_event_ == "STALL") {
1636 pe.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;
1637 pe2.config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND;
1638 } else {
1639 pe.config = PERF_COUNT_HW_CPU_CYCLES;
1640 pe2.config = PERF_COUNT_HW_INSTRUCTIONS;
1641 }
1642 perf_fd = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);  // perf_event_open expects a pointer to the attr struct
1643 if (perf_fd == -1) {
1644 MS_LOG(ERROR) << "Failed to open perf event " << pe.config;
1645 return RET_ERROR;
1646 }
1647 perf_fd2 = syscall(__NR_perf_event_open, &pe2, 0, -1, perf_fd, 0);
1648 if (perf_fd2 == -1) {
1649 MS_LOG(ERROR) << "Failed to open perf event " << pe2.config;
1650 return RET_ERROR;
1651 }
1652 struct PerfCount zero;
1653 zero.value[0] = 0;
1654 zero.value[1] = 0;
1655 // before callback
1656 ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
1657 const std::vector<mindspore::MSTensor> &before_outputs,
1658 const MSCallBackParam &call_param) {
1659 if (before_inputs.empty()) {
1660 MS_LOG(INFO) << "before_inputs is empty";
1661 }
1662 if (before_outputs.empty()) {
1663 MS_LOG(INFO) << "before_outputs is empty";
1664 }
1665 if (op_perf_by_type_.find(call_param.node_type) == op_perf_by_type_.end()) {
1666 op_perf_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, zero)));
1667 }
1668 if (op_perf_by_name_.find(call_param.node_name) == op_perf_by_name_.end()) {
1669 op_perf_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, zero)));
1670 }
1671
1672 op_call_times_total_++;
1673 ioctl(perf_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
1674 ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
1675 return true;
1676 };
1677
1678 // after callback
1679 ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
1680 const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
1681 struct PerfResult res;
1682 ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
1683 if (read(perf_fd, &res, sizeof(struct PerfResult)) == -1) {
1684 MS_LOG(ERROR) << "Failed to read perf_fd";
1685 return false;
1686 }
1687
1688 if (after_inputs.empty()) {
1689 MS_LOG(INFO) << "after_inputs is empty";
1690 }
1691 if (after_outputs.empty()) {
1692 MS_LOG(INFO) << "after_outputs is empty";
1693 }
1694 float cost1 = static_cast<float>(res.values[0].value);
1695 float cost2 = static_cast<float>(res.values[1].value);
1696 op_cost_total_ += cost1;
1697 op_cost2_total_ += cost2;
1698 op_perf_by_type_[call_param.node_type].first++;
1699 op_perf_by_type_[call_param.node_type].second.value[0] += cost1;
1700 op_perf_by_type_[call_param.node_type].second.value[1] += cost2;
1701 op_perf_by_name_[call_param.node_name].first++;
1702 op_perf_by_name_[call_param.node_name].second.value[0] += cost1;
1703 op_perf_by_name_[call_param.node_name].second.value[1] += cost2;
1704 return true;
1705 };
1706 #endif
1707 return RET_OK;
1708 }
1709
1710 namespace {
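// Anonymous-namespace helpers for printing and dumping tensors.
// DataToString() renders at most kDataToStringMaxNum elements of the buffer interpreted as T.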
1711 template <typename T>
DataToString(void * data,size_t data_number)1712 std::string DataToString(void *data, size_t data_number) {
1713 if (data == nullptr) {
1714 return "Data of tensor is nullptr";
1715 }
1716 std::ostringstream oss;
1717 auto casted_data = static_cast<T *>(data);
1718 for (size_t i = 0; i < kDataToStringMaxNum && i < data_number; i++) {
1719 oss << " " << casted_data[i];
1720 }
1721 return oss.str();
1722 }
1723
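// Formats a tensor's data type, shape and a short data preview. Note that float16 tensors are
// printed as raw int16 bit patterns rather than decoded half-precision values.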
DumpMSTensor(mindspore::MSTensor * tensor)1724 std::string DumpMSTensor(mindspore::MSTensor *tensor) {
1725 if (tensor == nullptr) {
1726 return "Tensor is nullptr";
1727 }
1728 std::ostringstream oss;
1729 oss << " DataType: " << static_cast<int>(tensor->DataType());
1730 oss << " Shape:";
1731 for (auto &dim : tensor->Shape()) {
1732 oss << " " << dim;
1733 }
1734 oss << std::endl << " Data:";
1735 switch (static_cast<int>(tensor->DataType())) {
1736 case kNumberTypeFloat32: {
1737 oss << DataToString<float>(tensor->MutableData(), tensor->ElementNum());
1738 } break;
1739 case kNumberTypeFloat16: {
1740 oss << DataToString<int16_t>(tensor->MutableData(), tensor->ElementNum());
1741 } break;
1742 case kNumberTypeInt32: {
1743 oss << DataToString<int32_t>(tensor->MutableData(), tensor->ElementNum());
1744 } break;
1745 case kNumberTypeInt16: {
1746 oss << DataToString<int16_t>(tensor->MutableData(), tensor->ElementNum());
1747 } break;
1748 case kNumberTypeInt8: {
1749 oss << DataToString<int8_t>(tensor->MutableData(), tensor->ElementNum());
1750 } break;
1751 default:
1752 oss << "Unsupported data type to print";
1753 break;
1754 }
1755 return oss.str();
1756 }
1757 #ifndef BENCHMARK_CLIP_JSON
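// Builds a dump file name of the form
//   <op name, '/' replaced with '.'>_<input|output>_<idx>_shape_<d0>_<d1>_..._<dtype>[_<format>].bin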
GenerateOutputFileName(mindspore::MSTensor * tensor,const std::string & op_name,const std::string & file_type,const size_t & idx)1758 std::string GenerateOutputFileName(mindspore::MSTensor *tensor, const std::string &op_name,
1759 const std::string &file_type, const size_t &idx) {
1760 std::string file_name = op_name;
1761 auto pos = file_name.find_first_of('/');
1762 while (pos != std::string::npos) {
1763 file_name.replace(pos, 1, ".");
1764 pos = file_name.find_first_of('/');
1765 }
1766 file_name += "_" + file_type + "_" + std::to_string(idx) + "_shape_";
1767 for (const auto &dim : tensor->Shape()) {
1768 file_name += std::to_string(dim) + "_";
1769 }
1770 if (kTypeIdMap.find(static_cast<int>(tensor->DataType())) != kTypeIdMap.end()) {
1771 file_name += kTypeIdMap.at(static_cast<int>(tensor->DataType()));
1772 }
1773 auto tensor_format = tensor->format();
1774 if (kTensorFormatMap.find(tensor_format) != kTensorFormatMap.end()) {
1775 file_name += "_" + kTensorFormatMap.at(tensor_format) + ".bin";
1776 } else {
1777 file_name += ".bin";
1778 }
1779
1780 return file_name;
1781 }
1782 #endif
1783 } // namespace
1784
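// Installs callbacks that print every node's input and output tensors to stdout after the node
// executes; the before callback is a no-op.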
InitPrintTensorDataCallbackParameter()1785 int BenchmarkUnifiedApi::InitPrintTensorDataCallbackParameter() {
1786 // before callback
1787 ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
1788 const std::vector<mindspore::MSTensor> &before_outputs,
1789 const MSCallBackParam &call_param) { return true; };
1790
1791 // after callback
1792 ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
1793 const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
1794 std::cout << "================================================================" << std::endl;
1795 std::cout << call_param.node_name << " inputs : " << std::endl;
1796 for (auto ms_tensor : after_inputs) {
1797 std::cout << DumpMSTensor(&ms_tensor) << std::endl;
1798 }
1799 std::cout << "----------------------------------------------------------------" << std::endl;
1800 std::cout << call_param.node_name << " outputs : " << std::endl;
1801 for (auto ms_tensor : after_outputs) {
1802 std::cout << DumpMSTensor(&ms_tensor) << std::endl;
1803 }
1804 std::cout << "================================================================" << std::endl;
1805 return true;
1806 };
1807 return RET_OK;
1808 }
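// Installs callbacks that write selected tensors as binary files under dump_file_output_dir_,
// driven by the dump JSON config. A minimal sketch of the expected layout (the key strings
// shown are illustrative; the real names come from the dump::kSettings/kMode/kInputOutput/
// kKernels constants):
//   { "common_dump_settings": { "dump_mode": 0, "input_output": 0, "kernels": ["nodeA"] } }
// dump_mode 0 dumps every node, otherwise only the nodes listed in "kernels"; input_output 0
// dumps inputs and outputs, 1 inputs only, 2 outputs only.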
InitDumpTensorDataCallbackParameter()1809 int BenchmarkUnifiedApi::InitDumpTensorDataCallbackParameter() {
1810 #ifndef BENCHMARK_CLIP_JSON
1811 // before callback
1812 ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
1813 const std::vector<mindspore::MSTensor> &before_outputs,
1814 const MSCallBackParam &call_param) {
1815 auto dump_mode = dump_cfg_json_[dump::kSettings][dump::kMode].get<int>();
1816 auto input_output_mode = dump_cfg_json_[dump::kSettings][dump::kInputOutput].get<int>();
1817 auto kernels = dump_cfg_json_[dump::kSettings][dump::kKernels].get<std::vector<std::string>>();
1818 if (dump_mode == 0 || std::find(kernels.begin(), kernels.end(), call_param.node_name) != kernels.end()) {
1819 if (input_output_mode == 0 || input_output_mode == 1) {
1820 for (size_t i = 0; i < before_inputs.size(); i++) {
1821 auto ms_tensor = before_inputs.at(i);
1822 auto file_name = GenerateOutputFileName(&ms_tensor, call_param.node_name, "input", i);
1823 auto abs_file_path = dump_file_output_dir_ + "/" + file_name;
1824 if (WriteToBin(abs_file_path, ms_tensor.MutableData(), ms_tensor.DataSize()) != RET_OK) { // save to file
1825 MS_LOG(ERROR) << "write tensor data to file failed.";
1826 return false;
1827 }
1828 }
1829 }
1830 }
1831 return true;
1832 };
1833
1834 // after callback
1835 ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
1836 const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
1837 auto dump_mode = dump_cfg_json_[dump::kSettings][dump::kMode].get<int>();
1838 auto input_output_mode = dump_cfg_json_[dump::kSettings][dump::kInputOutput].get<int>();
1839 auto kernels = dump_cfg_json_[dump::kSettings][dump::kKernels].get<std::vector<std::string>>();
1840 if (dump_mode == kDumpInputsAndOutputs ||
1841 std::find(kernels.begin(), kernels.end(), call_param.node_name) != kernels.end()) {
1842 if (input_output_mode == kDumpInputsAndOutputs || input_output_mode == kDumpOutputs) {
1843 for (size_t i = 0; i < after_outputs.size(); i++) {
1844 auto ms_tensor = after_outputs.at(i);
1845 auto file_name = GenerateOutputFileName(&ms_tensor, call_param.node_name, "output", i);
1846 auto abs_file_path = dump_file_output_dir_ + "/" + file_name;
1847 if (WriteToBin(abs_file_path, ms_tensor.MutableData(), ms_tensor.DataSize()) != RET_OK) { // save to file
1848 MS_LOG(ERROR) << "write tensor data to file failed.";
1849 return false;
1850 }
1851 }
1852 }
1853 }
1854 return true;
1855 };
1856 #endif
1857 return RET_OK;
1858 }
1859
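// In parallel-predict mode the input MSTensors only borrow buffers owned by all_inputs_data_,
// so they are detached (SetData(nullptr)) before the underlying char[] buffers are freed here.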
~BenchmarkUnifiedApi()1860 BenchmarkUnifiedApi::~BenchmarkUnifiedApi() {
1861 #ifdef PARALLEL_INFERENCE
1862 if (!flags_->enable_parallel_predict_) {
1863 return;
1864 }
1865 for (auto tensor : ms_inputs_for_api_) {
1866 auto data = tensor.MutableData();
1867 if (data != nullptr) {
1868 tensor.SetData(nullptr);
1869 }
1870 }
1871 for (auto &input : all_inputs_data_) {
1872 for (auto &data : input) {
1873 if (data != nullptr) {
1874 auto buf = static_cast<char *>(data);
1875 delete[] buf;
1876 data = nullptr;
1877 }
1878 }
1879 }
1880 #endif
1881 }
1882 } // namespace lite
1883 } // namespace mindspore
1884