/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "tools/benchmark/benchmark_unified_api.h"
#include <algorithm>
#include <utility>
#include <functional>
#include "include/context.h"
#include "include/ms_tensor.h"
#include "include/version.h"
#include "schema/model_generated.h"
#include "src/common/common.h"
#include "src/tensor.h"
#ifdef ENABLE_ARM64
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <asm/unistd.h>
#include <unistd.h>
#endif

namespace mindspore {
constexpr size_t kDataToStringMaxNum = 40;
constexpr int kPrintDataNum = 20;
constexpr int kFrequencyDefault = 3;
constexpr int kPercentageDivisor = 100;
#ifndef BENCHMARK_CLIP_JSON
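// Values of the dump-config "input_output" setting: 0 dumps both inputs and
// outputs, 2 dumps outputs only; the value 1 (inputs only) is compared as a
// literal in the dump callbacks below and has no named constant here. Note
// that kDumpInputsAndOutputs (0) is also reused as the "dump all kernels"
// value of the dump_mode setting.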
constexpr int kDumpInputsAndOutputs = 0;
constexpr int kDumpOutputs = 2;
#endif

namespace lite {
int BenchmarkUnifiedApi::GenerateInputData() {
  for (auto &tensor : ms_inputs_for_api_) {
    if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
      MSTensor *input = MSTensor::StringsToTensor(tensor.Name(), {"you're the best."});
      if (input == nullptr) {
        std::cerr << "StringsToTensor failed" << std::endl;
        MS_LOG(ERROR) << "StringsToTensor failed";
        return RET_ERROR;
      }
      tensor = *input;
    } else {
      auto input_data = tensor.MutableData();
      if (input_data == nullptr) {
        MS_LOG(ERROR) << "MallocData for inTensor failed";
        return RET_ERROR;
      }
      int status = GenerateRandomData(tensor.DataSize(), input_data, static_cast<int>(tensor.DataType()));
      if (status != RET_OK) {
        std::cerr << "GenerateRandomData for inTensor failed: " << status << std::endl;
        MS_LOG(ERROR) << "GenerateRandomData for inTensor failed: " << status;
        return status;
      }
    }
  }
  return RET_OK;
}

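// Reads user-supplied input files into the model's input tensors. Image
// preprocessing is not supported here; each file must be raw binary whose
// size matches the tensor's DataSize() exactly. String tensors are the
// exception: they are rebuilt from the file contents as a single string.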
int BenchmarkUnifiedApi::ReadInputFile() {
  if (ms_inputs_for_api_.empty()) {
    return RET_OK;
  }

  if (this->flags_->in_data_type_ == kImage) {
    MS_LOG(ERROR) << "Not supported image input";
    return RET_ERROR;
  } else {
    for (size_t i = 0; i < flags_->input_data_list_.size(); i++) {
      auto &cur_tensor = ms_inputs_for_api_.at(i);
      MS_ASSERT(cur_tensor != nullptr);
      size_t size;
      char *bin_buf = ReadFile(flags_->input_data_list_[i].c_str(), &size);
      if (bin_buf == nullptr) {
        MS_LOG(ERROR) << "ReadFile return nullptr";
        return RET_ERROR;
      }
      if (static_cast<int>(cur_tensor.DataType()) == kObjectTypeString) {
        std::string str(bin_buf, size);
        MSTensor *input = MSTensor::StringsToTensor(cur_tensor.Name(), {str});
        if (input == nullptr) {
          std::cerr << "StringsToTensor failed" << std::endl;
          MS_LOG(ERROR) << "StringsToTensor failed";
          delete[] bin_buf;
          return RET_ERROR;
        }
        cur_tensor = *input;
      } else {
        auto tensor_data_size = cur_tensor.DataSize();
        if (size != tensor_data_size) {
          std::cerr << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size
                    << std::endl;
          MS_LOG(ERROR) << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size;
          delete[] bin_buf;
          return RET_ERROR;
        }
        auto input_data = cur_tensor.MutableData();
        if (input_data == nullptr) {
          MS_LOG(ERROR) << "input_data is nullptr.";
          delete[] bin_buf;  // free the file buffer on this error path as well
          return RET_ERROR;
        }
        memcpy(input_data, bin_buf, tensor_data_size);
      }
      delete[] bin_buf;
    }
  }
  return RET_OK;
}

int BenchmarkUnifiedApi::GetDataTypeByTensorName(const std::string &tensor_name) {
  return static_cast<int>(ms_model_.GetOutputByTensorName(tensor_name).DataType());
}

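// Populates the Context from command-line flags. The order of entries in
// MutableDeviceInfo() is the scheduling priority: the requested accelerator
// (GPU, Kirin NPU, Ascend310, or NNRT) is appended first, and CPU is always
// appended last as the fallback device.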
void BenchmarkUnifiedApi::InitMSContext(const std::shared_ptr<mindspore::Context> &context) {
  context->SetThreadNum(flags_->num_threads_);
  context->SetEnableParallel(flags_->enable_parallel_);
  context->SetThreadAffinity(flags_->cpu_bind_mode_);
  auto &device_list = context->MutableDeviceInfo();

  if (flags_->device_ == "GPU") {
    std::shared_ptr<GPUDeviceInfo> gpu_device_info = std::make_shared<GPUDeviceInfo>();
    gpu_device_info->SetEnableFP16(flags_->enable_fp16_);
    device_list.push_back(gpu_device_info);
  }

  if (flags_->device_ == "NPU") {
    std::shared_ptr<KirinNPUDeviceInfo> npu_device_info = std::make_shared<KirinNPUDeviceInfo>();
    npu_device_info->SetFrequency(kFrequencyDefault);
    device_list.push_back(npu_device_info);
  }

  if (flags_->device_ == "Ascend310") {
    std::shared_ptr<Ascend310DeviceInfo> ascend310_device_info = std::make_shared<Ascend310DeviceInfo>();
    ascend310_device_info->SetDeviceID(0);
    device_list.push_back(ascend310_device_info);
  }

  if (flags_->device_ == "NNRT") {
    std::shared_ptr<NNRTDeviceInfo> nnrt_device_info = std::make_shared<NNRTDeviceInfo>();
    device_list.push_back(nnrt_device_info);
  }

  // CPU priority is behind GPU and NPU
  std::shared_ptr<CPUDeviceInfo> device_info = std::make_shared<CPUDeviceInfo>();
  device_info->SetEnableFP16(flags_->enable_fp16_);
  device_list.push_back(device_info);
}

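// Compares every output tensor against the calibration data. Each numeric
// tensor contributes one bias value; the reported figure is the mean bias as
// a percentage: mean_bias = (total_bias / total_size) * 100. The run fails
// when mean_bias exceeds flags_->accuracy_threshold_.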
int BenchmarkUnifiedApi::CompareOutput() {
  std::cout << "================ Comparing Output data ================" << std::endl;
  float total_bias = 0;
  int total_size = 0;
  // check the output tensor name.
  if (this->benchmark_tensor_names_ != ms_model_.GetOutputTensorNames()) {
    MS_LOG(ERROR) << "The output tensor name is wrong.";
    return RET_ERROR;
  }
  for (const auto &calib_tensor : benchmark_data_) {
    std::string tensor_name = calib_tensor.first;
    mindspore::MSTensor tensor = ms_model_.GetOutputByTensorName(tensor_name);
    if (tensor == nullptr) {
      MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
      return RET_ERROR;
    }
    int ret;
    if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
      std::vector<std::string> output_strings = MSTensor::TensorToStrings(tensor);
      ret = CompareStringData(tensor_name, calib_tensor.second->strings_data, output_strings);
    } else {
      ret = CompareDataGetTotalBiasAndSize(tensor_name, &tensor, &total_bias, &total_size);
    }
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Error in CompareData";
      std::cerr << "Error in CompareData" << std::endl;
      std::cout << "=======================================================" << std::endl << std::endl;
      return ret;
    }
  }
  float mean_bias;
  if (total_size != 0) {
    mean_bias = ((total_bias / float_t(total_size)) * kPercentageDivisor);
  } else {
    mean_bias = 0;
  }

  std::cout << "Mean bias of all nodes/tensors: " << mean_bias << "%" << std::endl;
  std::cout << "=======================================================" << std::endl << std::endl;

  if (mean_bias > this->flags_->accuracy_threshold_) {
    MS_LOG(ERROR) << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%";
    std::cerr << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%" << std::endl;
    return RET_ERROR;
  }
  return RET_OK;
}

int BenchmarkUnifiedApi::CompareDataGetTotalBiasAndSize(const std::string &name, mindspore::MSTensor *tensor,
                                                        float *total_bias, int *total_size) {
  float bias = 0;
  auto mutableData = tensor->MutableData();
  if (mutableData == nullptr) {
    MS_LOG(ERROR) << "mutableData is nullptr.";
    return RET_ERROR;
  }
  switch (static_cast<int>(tensor->DataType())) {
    case TypeId::kNumberTypeFloat:
    case TypeId::kNumberTypeFloat32: {
      bias = CompareData<float, int64_t>(name, tensor->Shape(), mutableData);
      break;
    }
    case TypeId::kNumberTypeInt8: {
      bias = CompareData<int8_t, int64_t>(name, tensor->Shape(), mutableData);
      break;
    }
    case TypeId::kNumberTypeUInt8: {
      bias = CompareData<uint8_t, int64_t>(name, tensor->Shape(), mutableData);
      break;
    }
    case TypeId::kNumberTypeInt32: {
      bias = CompareData<int32_t, int64_t>(name, tensor->Shape(), mutableData);
      break;
    }
    case TypeId::kNumberTypeInt16: {
      bias = CompareData<int16_t, int64_t>(name, tensor->Shape(), mutableData);
      break;
    }
    case TypeId::kNumberTypeBool: {
      bias = CompareData<bool, int64_t>(name, tensor->Shape(), mutableData);
      break;
    }
    default:
      MS_LOG(ERROR) << "Datatype " << static_cast<int>(tensor->DataType()) << " is not supported.";
      return RET_ERROR;
  }
  if (bias < 0) {
    MS_LOG(ERROR) << "CompareData failed, name: " << name;
    return RET_ERROR;
  }
  *total_bias += bias;
  *total_size += 1;
  return RET_OK;
}

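// Runs warm_up_loop_count_ untimed inferences, then loop_count_ timed ones,
// tracking min/max/accumulated wall time in microseconds (divided by
// kFloatMSEC for the millisecond report). Per-op tables are printed when
// time profiling is on, or (on arm64) when perf profiling is on, using the
// statistics accumulated by the callbacks installed in the Init*Callback
// functions below.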
int BenchmarkUnifiedApi::MarkPerformance() {
  MS_LOG(INFO) << "Running warm up loops...";
  std::cout << "Running warm up loops..." << std::endl;
  std::vector<MSTensor> outputs;

  for (int i = 0; i < flags_->warm_up_loop_count_; i++) {
    auto status = ms_model_.Predict(ms_inputs_for_api_, &outputs);
    if (status != kSuccess) {
      MS_LOG(ERROR) << "Inference error";
      std::cerr << "Inference error" << std::endl;
      return RET_ERROR;
    }
  }

  MS_LOG(INFO) << "Running benchmark loops...";
  std::cout << "Running benchmark loops..." << std::endl;
  uint64_t time_min = 1000000;
  uint64_t time_max = 0;
  uint64_t time_avg = 0;

  for (int i = 0; i < flags_->loop_count_; i++) {
    auto inputs = ms_model_.GetInputs();
    for (auto tensor : inputs) {
      tensor.MutableData();  // prepare data
    }
    auto start = GetTimeUs();
    auto status = ms_model_.Predict(ms_inputs_for_api_, &outputs, ms_before_call_back_, ms_after_call_back_);
    if (status != kSuccess) {
      MS_LOG(ERROR) << "Inference error";
      std::cerr << "Inference error" << std::endl;
      return RET_ERROR;
    }

    auto end = GetTimeUs();
    auto time = end - start;
    time_min = std::min(time_min, time);
    time_max = std::max(time_max, time);
    time_avg += time;
  }

  if (flags_->time_profiling_) {
    const std::vector<std::string> per_op_name = {"opName", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
    const std::vector<std::string> per_op_type = {"opType", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
    PrintResult(per_op_name, op_times_by_name_);
    PrintResult(per_op_type, op_times_by_type_);
#ifdef ENABLE_ARM64
  } else if (flags_->perf_profiling_) {
    if (flags_->perf_event_ == "CACHE") {
      const std::vector<std::string> per_op_name = {"opName", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
      const std::vector<std::string> per_op_type = {"opType", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
      PrintPerfResult(per_op_name, op_perf_by_name_);
      PrintPerfResult(per_op_type, op_perf_by_type_);
    } else if (flags_->perf_event_ == "STALL") {
      const std::vector<std::string> per_op_name = {"opName", "frontend(k)", "frontend(%)", "backend(k)",
                                                    "backend(%)"};
      const std::vector<std::string> per_op_type = {"opType", "frontend(k)", "frontend(%)", "backend(k)",
                                                    "backend(%)"};
      PrintPerfResult(per_op_name, op_perf_by_name_);
      PrintPerfResult(per_op_type, op_perf_by_type_);
    } else {
      const std::vector<std::string> per_op_name = {"opName", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
      const std::vector<std::string> per_op_type = {"opType", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
      PrintPerfResult(per_op_name, op_perf_by_name_);
      PrintPerfResult(per_op_type, op_perf_by_type_);
    }
#endif
  }

  if (flags_->loop_count_ > 0) {
    time_avg /= flags_->loop_count_;
    MS_LOG(INFO) << "Model = " << flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
                 << ", NumThreads = " << flags_->num_threads_ << ", MinRunTime = " << time_min / kFloatMSEC
                 << ", MaxRuntime = " << time_max / kFloatMSEC << ", AvgRunTime = " << time_avg / kFloatMSEC;
    printf("Model = %s, NumThreads = %d, MinRunTime = %f ms, MaxRuntime = %f ms, AvgRunTime = %f ms\n",
           flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1).c_str(), flags_->num_threads_,
           time_min / kFloatMSEC, time_max / kFloatMSEC, time_avg / kFloatMSEC);
  }
  return RET_OK;
}

int BenchmarkUnifiedApi::MarkAccuracy() {
  MS_LOG(INFO) << "MarkAccuracy";
  std::cout << "MarkAccuracy" << std::endl;

  auto status = PrintInputData();
  if (status != RET_OK) {
    MS_LOG(ERROR) << "PrintInputData error " << status;
    std::cerr << "PrintInputData error " << status << std::endl;
    return status;
  }
  std::vector<MSTensor> outputs;
  auto ret = ms_model_.Predict(ms_inputs_for_api_, &outputs, ms_before_call_back_, ms_after_call_back_);
  if (ret != kSuccess) {
    MS_LOG(ERROR) << "Inference error";
    std::cerr << "Inference error" << std::endl;
    return RET_ERROR;
  }
  status = ReadCalibData();
  if (status != RET_OK) {
    MS_LOG(ERROR) << "Read calib data error " << status;
    std::cerr << "Read calib data error " << status << std::endl;
    return status;
  }
  status = CompareOutput();
  if (status != RET_OK) {
    MS_LOG(ERROR) << "Compare output error " << status;
    std::cerr << "Compare output error " << status << std::endl;
    return status;
  }
  return RET_OK;
}

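// Echoes up to kPrintDataNum leading elements of each input tensor (or up to
// kPrintDataNum strings for string tensors) so accuracy runs record exactly
// what was fed in.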
int BenchmarkUnifiedApi::PrintInputData() {
  for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
    auto input = ms_inputs_for_api_[i];
    MS_ASSERT(input != nullptr);
    auto tensor_data_type = static_cast<int>(input.DataType());

    std::cout << "InData" << i << ": ";
    if (tensor_data_type == TypeId::kObjectTypeString) {
      std::vector<std::string> output_strings = MSTensor::TensorToStrings(input);
      size_t print_num = std::min(output_strings.size(), static_cast<size_t>(kPrintDataNum));
      for (size_t j = 0; j < print_num; j++) {
        std::cout << output_strings[j] << std::endl;
      }
      continue;
    }
    size_t print_num = std::min(static_cast<int>(input.ElementNum()), kPrintDataNum);
    const void *in_data = input.MutableData();
    if (in_data == nullptr) {
      MS_LOG(ERROR) << "in_data is nullptr.";
      return RET_ERROR;
    }

    for (size_t j = 0; j < print_num; j++) {
      if (tensor_data_type == TypeId::kNumberTypeFloat32 || tensor_data_type == TypeId::kNumberTypeFloat) {
        std::cout << static_cast<const float *>(in_data)[j] << " ";
      } else if (tensor_data_type == TypeId::kNumberTypeInt8) {
        std::cout << static_cast<const int8_t *>(in_data)[j] << " ";
      } else if (tensor_data_type == TypeId::kNumberTypeUInt8) {
        std::cout << static_cast<const uint8_t *>(in_data)[j] << " ";
      } else if (tensor_data_type == TypeId::kNumberTypeInt32) {
        std::cout << static_cast<const int32_t *>(in_data)[j] << " ";
      } else if (tensor_data_type == TypeId::kNumberTypeInt64) {
        std::cout << static_cast<const int64_t *>(in_data)[j] << " ";
      } else if (tensor_data_type == TypeId::kNumberTypeBool) {
        std::cout << static_cast<const bool *>(in_data)[j] << " ";
      } else {
        MS_LOG(ERROR) << "Datatype: " << tensor_data_type << " is not supported.";
        return RET_ERROR;
      }
    }
    std::cout << std::endl;
  }
  return RET_OK;
}

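// Top-level driver: read the model file, build a Context from the flags,
// optionally load a config file and resize inputs, then run either the
// accuracy path (when a calibration data file is given) or the performance
// path. Preparation time is reported separately from inference time.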
int BenchmarkUnifiedApi::RunBenchmark() {
  auto start_prepare_time = GetTimeUs();
  // Load graph
  std::string model_name = flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1);

  MS_LOG(INFO) << "start reading model file";
  std::cout << "start reading model file" << std::endl;
  size_t size = 0;
  char *graph_buf = ReadFile(flags_->model_file_.c_str(), &size);
  if (graph_buf == nullptr) {
    MS_LOG(ERROR) << "Read model file failed while running " << model_name.c_str();
    std::cerr << "Read model file failed while running " << model_name.c_str() << std::endl;
    return RET_ERROR;
  }

  auto context = std::make_shared<mindspore::Context>();
  if (context == nullptr) {
    MS_LOG(ERROR) << "New context failed while running " << model_name.c_str();
    std::cerr << "New context failed while running " << model_name.c_str() << std::endl;
    delete[] graph_buf;  // free the model buffer on this error path
    return RET_ERROR;
  }

  (void)InitMSContext(context);

  if (!flags_->config_file_.empty()) {
    auto config_ret = ms_model_.LoadConfig(flags_->config_file_);
    if (config_ret != kSuccess) {
      MS_LOG(ERROR) << "ms_model_.LoadConfig failed while running " << model_name.c_str();
      std::cerr << "ms_model_.LoadConfig failed while running " << model_name.c_str() << std::endl;
    }
  }

  auto ret = ms_model_.Build(graph_buf, size, kMindIR, context);
  delete[] graph_buf;
  if (ret != kSuccess) {
    MS_LOG(ERROR) << "ms_model_.Build failed while running " << model_name.c_str();
    std::cerr << "ms_model_.Build failed while running " << model_name.c_str() << std::endl;
    return RET_ERROR;
  }

  if (!flags_->resize_dims_.empty()) {
    std::vector<std::vector<int64_t>> resize_dims;
    (void)std::transform(flags_->resize_dims_.begin(), flags_->resize_dims_.end(), std::back_inserter(resize_dims),
                         [&](auto &shapes) { return this->ConverterToInt64Vector<int>(shapes); });

    ret = ms_model_.Resize(ms_model_.GetInputs(), resize_dims);
    if (ret != kSuccess) {
      MS_LOG(ERROR) << "Input tensor resize failed.";
      std::cerr << "Input tensor resize failed." << std::endl;
      return RET_ERROR;
    }
  }

  ms_inputs_for_api_ = ms_model_.GetInputs();
  auto end_prepare_time = GetTimeUs();
  MS_LOG(INFO) << "PrepareTime = " << ((end_prepare_time - start_prepare_time) / kFloatMSEC) << " ms";
  std::cout << "PrepareTime = " << ((end_prepare_time - start_prepare_time) / kFloatMSEC) << " ms" << std::endl;

  // Load input
  MS_LOG(INFO) << "start generate input data";
  auto status = LoadInput();
  if (status != 0) {
    MS_LOG(ERROR) << "Generate input data error";
    return status;
  }
  if (!flags_->benchmark_data_file_.empty()) {
    status = MarkAccuracy();
    if (status != 0) {
      MS_LOG(ERROR) << "Run MarkAccuracy error: " << status;
      std::cout << "Run MarkAccuracy error: " << status << std::endl;
      return status;
    }
  } else {
    status = MarkPerformance();
    if (status != 0) {
      MS_LOG(ERROR) << "Run MarkPerformance error: " << status;
      std::cout << "Run MarkPerformance error: " << status << std::endl;
      return status;
    }
  }
  if (flags_->dump_tensor_data_) {
    std::cout << "Dumped file is saved to : " + dump_file_output_dir_ << std::endl;
  }
  return RET_OK;
}

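// Installs a before/after callback pair for per-op timing: the before hook
// stamps op_begin_, the after hook accumulates elapsed time per node name and
// per node type. On GPU, the driver-reported execute_time carried in
// GPUCallBackParam is used instead of the host-side timestamps.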
int BenchmarkUnifiedApi::InitTimeProfilingCallbackParameter() {
  // before callback
  ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
                             const std::vector<mindspore::MSTensor> &before_outputs,
                             const MSCallBackParam &call_param) {
    if (before_inputs.empty()) {
      MS_LOG(INFO) << "The num of beforeInputs is empty";
    }
    if (before_outputs.empty()) {
      MS_LOG(INFO) << "The num of beforeOutputs is empty";
    }
    if (op_times_by_type_.find(call_param.node_type) == op_times_by_type_.end()) {
      op_times_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, 0.0f)));
    }
    if (op_times_by_name_.find(call_param.node_name) == op_times_by_name_.end()) {
      op_times_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, 0.0f)));
    }

    op_call_times_total_++;
    op_begin_ = GetTimeUs();
    return true;
  };

  // after callback
  ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
                            const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
    uint64_t opEnd = GetTimeUs();

    if (after_inputs.empty()) {
      MS_LOG(INFO) << "The num of after inputs is empty";
    }
    if (after_outputs.empty()) {
      MS_LOG(INFO) << "The num of after outputs is empty";
    }

    float cost = static_cast<float>(opEnd - op_begin_) / kFloatMSEC;
    if (flags_->device_ == "GPU") {
      auto gpu_param = reinterpret_cast<const GPUCallBackParam &>(call_param);
      cost = static_cast<float>(gpu_param.execute_time);
    }
    op_cost_total_ += cost;
    op_times_by_type_[call_param.node_type].first++;
    op_times_by_type_[call_param.node_type].second += cost;
    op_times_by_name_[call_param.node_name].first++;
    op_times_by_name_[call_param.node_name].second += cost;
    return true;
  };
  return RET_OK;
}

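// Sets up two hardware counters: perf_fd is the group leader, perf_fd2 joins
// its event group, and because both use PERF_FORMAT_GROUP | PERF_FORMAT_ID a
// single read() on the leader returns both counts. A sketch of the layout the
// kernel returns for such a read, which PerfResult (defined in the benchmark
// header) is assumed to mirror:
//   struct PerfResult {
//     uint64_t nr;                                     // number of events, 2 here
//     struct { uint64_t value; uint64_t id; } values[2];
//   };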
int BenchmarkUnifiedApi::InitPerfProfilingCallbackParameter() {
#ifndef ENABLE_ARM64
  MS_LOG(ERROR) << "Only support perf_profiling on arm64.";
  return RET_ERROR;
#else
  struct perf_event_attr pe, pe2;
  memset(&pe, 0, sizeof(struct perf_event_attr));
  memset(&pe2, 0, sizeof(struct perf_event_attr));
  pe.type = PERF_TYPE_HARDWARE;
  pe2.type = PERF_TYPE_HARDWARE;
  pe.size = sizeof(struct perf_event_attr);
  pe2.size = sizeof(struct perf_event_attr);
  pe.disabled = 1;
  pe2.disabled = 1;
  pe.exclude_kernel = 1;   // don't count kernel
  pe2.exclude_kernel = 1;  // don't count kernel
  pe.exclude_hv = 1;       // don't count hypervisor
  pe2.exclude_hv = 1;      // don't count hypervisor
  pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
  pe2.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
  if (flags_->perf_event_ == "CACHE") {
    pe.config = PERF_COUNT_HW_CACHE_REFERENCES;
    pe2.config = PERF_COUNT_HW_CACHE_MISSES;
  } else if (flags_->perf_event_ == "STALL") {
    pe.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;
    pe2.config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND;
  } else {
    pe.config = PERF_COUNT_HW_CPU_CYCLES;
    pe2.config = PERF_COUNT_HW_INSTRUCTIONS;
  }
  // perf_event_open takes a pointer to the attribute struct.
  perf_fd = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
  if (perf_fd == -1) {
    MS_LOG(ERROR) << "Failed to open perf event " << pe.config;
    return RET_ERROR;
  }
  perf_fd2 = syscall(__NR_perf_event_open, &pe2, 0, -1, perf_fd, 0);  // join perf_fd's group
  if (perf_fd2 == -1) {
    MS_LOG(ERROR) << "Failed to open perf event " << pe2.config;
    return RET_ERROR;
  }
  struct PerfCount zero;
  zero.value[0] = 0;
  zero.value[1] = 0;
  // before callback; zero is captured by value because the callback outlives this stack frame
  ms_before_call_back_ = [&, zero](const std::vector<mindspore::MSTensor> &before_inputs,
                                   const std::vector<mindspore::MSTensor> &before_outputs,
                                   const MSCallBackParam &call_param) {
    if (before_inputs.empty()) {
      MS_LOG(INFO) << "The num of beforeInputs is empty";
    }
    if (before_outputs.empty()) {
      MS_LOG(INFO) << "The num of beforeOutputs is empty";
    }
    if (op_perf_by_type_.find(call_param.node_type) == op_perf_by_type_.end()) {
      op_perf_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, zero)));
    }
    if (op_perf_by_name_.find(call_param.node_name) == op_perf_by_name_.end()) {
      op_perf_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, zero)));
    }

    op_call_times_total_++;
    ioctl(perf_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
    ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
    return true;
  };

  // after callback
  ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
                            const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
    struct PerfResult res;
    ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
    if (read(perf_fd, &res, sizeof(struct PerfResult)) == -1) {
      MS_LOG(ERROR) << "Failed to read perf_fd";
      return false;
    }

    if (after_inputs.empty()) {
      MS_LOG(INFO) << "The num of after inputs is empty";
    }
    if (after_outputs.empty()) {
      MS_LOG(INFO) << "The num of after outputs is empty";
    }
    float cost1 = static_cast<float>(res.values[0].value);
    float cost2 = static_cast<float>(res.values[1].value);
    op_cost_total_ += cost1;
    op_cost2_total_ += cost2;
    op_perf_by_type_[call_param.node_type].first++;
    op_perf_by_type_[call_param.node_type].second.value[0] += cost1;
    op_perf_by_type_[call_param.node_type].second.value[1] += cost2;
    op_perf_by_name_[call_param.node_name].first++;
    op_perf_by_name_[call_param.node_name].second.value[0] += cost1;
    op_perf_by_name_[call_param.node_name].second.value[1] += cost2;
    return true;
  };
#endif
  return RET_OK;
}

namespace {
template <typename T>
std::string DataToString(void *data, size_t data_number) {
  if (data == nullptr) {
    return "Data of tensor is nullptr";
  }
  std::ostringstream oss;
  auto casted_data = static_cast<T *>(data);
  for (size_t i = 0; i < kDataToStringMaxNum && i < data_number; i++) {
    oss << " " << casted_data[i];
  }
  return oss.str();
}

std::string DumpMSTensor(mindspore::MSTensor *tensor) {
  if (tensor == nullptr) {
    return "Tensor is nullptr";
  }
  std::ostringstream oss;
  oss << " DataType: " << static_cast<int>(tensor->DataType());
  oss << " Shape:";
  for (auto &dim : tensor->Shape()) {
    oss << " " << dim;
  }
  oss << std::endl << " Data:";
  switch (static_cast<int>(tensor->DataType())) {
    case kNumberTypeFloat32: {
      oss << DataToString<float>(tensor->MutableData(), tensor->ElementNum());
    } break;
    case kNumberTypeFloat16: {
      oss << DataToString<int16_t>(tensor->MutableData(), tensor->ElementNum());
    } break;
    case kNumberTypeInt32: {
      oss << DataToString<int32_t>(tensor->MutableData(), tensor->ElementNum());
    } break;
    case kNumberTypeInt16: {
      oss << DataToString<int16_t>(tensor->MutableData(), tensor->ElementNum());
    } break;
    case kNumberTypeInt8: {
      oss << DataToString<int8_t>(tensor->MutableData(), tensor->ElementNum());
    } break;
    default:
      oss << "Unsupported data type to print";
      break;
  }
  return oss.str();
}
#ifndef BENCHMARK_CLIP_JSON
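// Builds a dump file name of the form
//   <op_name with '/' replaced by '.'>_<input|output>_<idx>_shape_<d0>_<d1>_..._<dtype>[_<format>].bin
// e.g. (illustrative only) "conv1_output_0_shape_1_16_16_8_Float32_NHWC.bin".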
std::string GenerateOutputFileName(mindspore::MSTensor *tensor, const std::string &op_name,
                                   const std::string &file_type, const size_t &idx) {
  std::string file_name = op_name;
  auto pos = file_name.find_first_of('/');
  while (pos != std::string::npos) {
    file_name.replace(pos, 1, ".");
    pos = file_name.find_first_of('/');
  }
  file_name += "_" + file_type + "_" + std::to_string(idx) + "_shape_";
  for (const auto &dim : tensor->Shape()) {
    file_name += std::to_string(dim) + "_";
  }
  if (kTypeIdMap.find(static_cast<int>(tensor->DataType())) != kTypeIdMap.end()) {
    file_name += kTypeIdMap.at(static_cast<int>(tensor->DataType()));
  }

  auto tensor_format = tensor->format();
  if (kTensorFormatMap.find(tensor_format) != kTensorFormatMap.end()) {
    file_name += "_" + kTensorFormatMap.at(tensor_format);
  }

  file_name += ".bin";
  return file_name;
}
#endif
}  // namespace

int BenchmarkUnifiedApi::InitPrintTensorDataCallbackParameter() {
  // before callback
  ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
                             const std::vector<mindspore::MSTensor> &before_outputs,
                             const MSCallBackParam &call_param) { return true; };

  // after callback
  ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
                            const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
    std::cout << "================================================================" << std::endl;
    std::cout << call_param.node_name << " inputs : " << std::endl;
    for (auto ms_tensor : after_inputs) {
      std::cout << DumpMSTensor(&ms_tensor) << std::endl;
    }
    std::cout << "----------------------------------------------------------------" << std::endl;
    std::cout << call_param.node_name << " outputs : " << std::endl;
    for (auto ms_tensor : after_outputs) {
      std::cout << DumpMSTensor(&ms_tensor) << std::endl;
    }
    std::cout << "================================================================" << std::endl;
    return true;
  };
  return RET_OK;
}
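
// Dump behaviour is driven by dump_cfg_json_, which is parsed elsewhere from
// a dump config JSON file. A minimal sketch of the shape this code expects
// (key strings shown are illustrative; the real ones come from the dump::
// constants):
//   {
//     "common_dump_settings": {
//       "dump_mode": 0,        // 0: all kernels, otherwise only those listed in "kernels"
//       "input_output": 0,     // 0: inputs + outputs, 1: inputs only, 2: outputs only
//       "kernels": ["Conv2D-op1"]
//     }
//   }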
int BenchmarkUnifiedApi::InitDumpTensorDataCallbackParameter() {
#ifndef BENCHMARK_CLIP_JSON
  // before callback
  ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
                             const std::vector<mindspore::MSTensor> &before_outputs,
                             const MSCallBackParam &call_param) {
    auto dump_mode = dump_cfg_json_[dump::kSettings][dump::kMode].get<int>();
    auto input_output_mode = dump_cfg_json_[dump::kSettings][dump::kInputOutput].get<int>();
    auto kernels = dump_cfg_json_[dump::kSettings][dump::kKernels].get<std::vector<std::string>>();
    if (dump_mode == 0 || std::find(kernels.begin(), kernels.end(), call_param.node_name) != kernels.end()) {
      if (input_output_mode == 0 || input_output_mode == 1) {
        for (size_t i = 0; i < before_inputs.size(); i++) {
          auto ms_tensor = before_inputs.at(i);
          auto file_name = GenerateOutputFileName(&ms_tensor, call_param.node_name, "input", i);
          auto abs_file_path = dump_file_output_dir_ + "/" + file_name;
          if (WriteToBin(abs_file_path, ms_tensor.MutableData(), ms_tensor.DataSize()) != RET_OK) {  // save to file
            MS_LOG(ERROR) << "write tensor data to file failed.";
            return false;
          }
        }
      }
    }
    return true;
  };

  // after callback
  ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
                            const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
    auto dump_mode = dump_cfg_json_[dump::kSettings][dump::kMode].get<int>();
    auto input_output_mode = dump_cfg_json_[dump::kSettings][dump::kInputOutput].get<int>();
    auto kernels = dump_cfg_json_[dump::kSettings][dump::kKernels].get<std::vector<std::string>>();
    if (dump_mode == kDumpInputsAndOutputs ||
        std::find(kernels.begin(), kernels.end(), call_param.node_name) != kernels.end()) {
      if (input_output_mode == kDumpInputsAndOutputs || input_output_mode == kDumpOutputs) {
        for (size_t i = 0; i < after_outputs.size(); i++) {
          auto ms_tensor = after_outputs.at(i);
          auto file_name = GenerateOutputFileName(&ms_tensor, call_param.node_name, "output", i);
          auto abs_file_path = dump_file_output_dir_ + "/" + file_name;
          if (WriteToBin(abs_file_path, ms_tensor.MutableData(), ms_tensor.DataSize()) != RET_OK) {  // save to file
            MS_LOG(ERROR) << "write tensor data to file failed.";
            return false;
          }
        }
      }
    }
    return true;
  };
#endif
  return RET_OK;
}

BenchmarkUnifiedApi::~BenchmarkUnifiedApi() {}
}  // namespace lite
}  // namespace mindspore