• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "profiler/device/data_saver.h"
17 #include <fstream>
18 #include <numeric>
19 #include "sys/stat.h"
20 #include "utils/ms_utils.h"
21 #include "utils/ms_context.h"
22 
23 namespace mindspore {
24 namespace profiler {
OpDetailInfo(const std::shared_ptr<OpInfo> op_info,float proportion)25 OpDetailInfo::OpDetailInfo(const std::shared_ptr<OpInfo> op_info, float proportion)
26     : op_info_(op_info), proportion_(proportion) {
27   // op_full_name is like 'xxx/xxx/{op_type}-op{node_id}'
28   op_full_name_ = op_info->op_name;
29   auto op_type_begin_iter = op_full_name_.rfind('/') + 1;
30   auto op_type_end_iter = op_full_name_.rfind('-');
31   op_type_ = op_full_name_.substr(op_type_begin_iter, op_type_end_iter - op_type_begin_iter);
32   op_name_ = op_full_name_.substr(op_type_begin_iter);
33   if (op_info->op_count == 0) {
34     MS_LOG(ERROR) << "The num of operations can not be 0.";
35     return;
36   }
37   op_avg_time_ = op_info->op_host_cost_time / op_info->op_count;
38 }
39 
ParseOpInfo(const OpInfoMap & op_info_maps)40 void DataSaver::ParseOpInfo(const OpInfoMap &op_info_maps) {
41   op_detail_infos_.reserve(op_info_maps.size());
42   float total_time_sum = GetTotalOpTime(op_info_maps);
43   for (auto item : op_info_maps) {
44     op_timestamps_map_[item.first] = item.second.start_duration;
45     if (total_time_sum == 0.0) {
46       MS_LOG(ERROR) << "The total operation times can not be 0.";
47       return;
48     }
49     float proportion = item.second.op_host_cost_time / total_time_sum;
50     auto op_info = std::make_shared<OpInfo>(item.second);
51     if (op_info == nullptr) {
52       MS_LOG(ERROR) << "Create Operation information node failed when parse operation information.";
53       return;
54     }
55     OpDetailInfo op_detail_info = OpDetailInfo(op_info, proportion);
56     op_detail_infos_.emplace_back(op_detail_info);
57     AddOpDetailInfoForType(op_detail_info);
58   }
59   // update average time of op type
60   for (auto &op_type : op_type_infos_) {
61     // device_infos: <type_name, op_type_info>
62     if (op_type.second.count_ == 0) {
63       MS_LOG(ERROR) << "The num of operation type can not be 0.";
64       return;
65     }
66     op_type.second.avg_time_ = op_type.second.total_time_ / op_type.second.count_;
67   }
68   MS_LOG(DEBUG) << "Get " << op_detail_infos_.size() << " operation items.";
69   MS_LOG(DEBUG) << "Get " << op_type_infos_.size() << " operation type items.";
70 }
71 
AddOpDetailInfoForType(const OpDetailInfo & op_detail_info)72 void DataSaver::AddOpDetailInfoForType(const OpDetailInfo &op_detail_info) {
73   // Construct OpType object according to op detail info
74   OpType op_type = OpType{op_detail_info.op_type_,
75                           op_detail_info.op_info_->op_count,
76                           op_detail_info.op_info_->op_count,
77                           op_detail_info.op_info_->op_host_cost_time,
78                           0,
79                           op_detail_info.proportion_};
80   // Set the OpType into op_type_infos_ map
81   std::string type_name = op_detail_info.op_type_;
82   auto iter = op_type_infos_.find(type_name);
83   if (iter == op_type_infos_.end()) {
84     op_type_infos_.emplace(type_name, op_type);
85   } else {
86     iter->second += op_type;
87   }
88 }
89 
GetTotalOpTime(const OpInfoMap & op_info_maps) const90 float DataSaver::GetTotalOpTime(const OpInfoMap &op_info_maps) const {
91   float sum = 0;
92   sum = std::accumulate(op_info_maps.begin(), op_info_maps.end(), sum,
93                         [](float i, auto iter) { return i + iter.second.op_host_cost_time; });
94   MS_LOG(DEBUG) << "The total op time is " << sum;
95   return sum;
96 }
97 
WriteOpType(const std::string & saver_base_dir) const98 void DataSaver::WriteOpType(const std::string &saver_base_dir) const {
99   std::string file_path = saver_base_dir + "/" + op_side_ + "_op_type_info_" + device_id_ + ".csv";
100   std::ofstream ofs(file_path);
101   // check if the file is writable
102   if (!ofs.is_open()) {
103     MS_LOG(WARNING) << "Open file '" << file_path << "' failed!";
104     return;
105   }
106   try {
107     // write op type info into file
108     if (op_side_ == "cpu") {
109       ofs << OpType().GetCpuHeader() << std::endl;
110       for (auto op_type_info : op_type_infos_) {
111         op_type_info.second.OutputCpuOpTypeInfo(ofs);
112       }
113     }
114     if (op_side_ == "gpu") {
115       ofs << OpType().GetGpuHeader() << std::endl;
116       for (auto op_type_info : op_type_infos_) {
117         op_type_info.second.OutputGpuOpTypeInfo(ofs);
118       }
119     }
120   } catch (const std::exception &e) {
121     MS_LOG(ERROR) << "Write " << file_path << "failed: " << e.what();
122   }
123   ofs.close();
124   ChangeFileMode(file_path);
125   MS_LOG(INFO) << "Write " << op_type_infos_.size() << " op type infos into file: " << file_path;
126 }
127 
WriteOpDetail(const std::string & saver_base_dir) const128 void DataSaver::WriteOpDetail(const std::string &saver_base_dir) const {
129   std::string file_path = saver_base_dir + "/" + op_side_ + "_op_detail_info_" + device_id_ + ".csv";
130   std::ofstream ofs(file_path);
131   if (!ofs.is_open()) {
132     MS_LOG(WARNING) << "Open file '" << file_path << "' failed!";
133     return;
134   }
135   try {
136     // write op detail info into file
137     if (op_side_ == "cpu") {
138       ofs << OpDetailInfo().GetCpuHeader() << std::endl;
139       for (auto op_detail : op_detail_infos_) {
140         op_detail.OutputCpuOpDetailInfo(ofs);
141       }
142     }
143     if (op_side_ == "gpu") {
144       ofs << OpDetailInfo().GetGpuHeader() << std::endl;
145       for (auto op_detail : op_detail_infos_) {
146         op_detail.OutputGpuOpDetailInfo(ofs);
147       }
148     }
149   } catch (const std::exception &e) {
150     MS_LOG(ERROR) << "Write " << file_path << "failed: " << e.what();
151   }
152   ofs.close();
153   ChangeFileMode(file_path);
154   MS_LOG(INFO) << "Write " << op_detail_infos_.size() << " op detail infos into file: " << file_path;
155 }
156 
WriteOpTimestamp(const std::string & saver_base_dir) const157 void DataSaver::WriteOpTimestamp(const std::string &saver_base_dir) const {
158   std::string file_path = saver_base_dir + "/" + op_side_ + "_op_execute_timestamp_" + device_id_ + ".txt";
159   std::ofstream ofs(file_path);
160   // check if the file is writable
161   if (!ofs.is_open()) {
162     MS_LOG(WARNING) << "Open file '" << file_path << "' failed!";
163     return;
164   }
165   try {
166     // write op timestamp info into file
167     for (const auto &op_timestamp_info : op_timestamps_map_) {
168       if (op_side_ == "cpu") {
169         ofs << op_timestamp_info.first << ";HostCpuOps;";
170       } else {
171         ofs << op_timestamp_info.first << ";GpuOps;";
172       }
173       for (auto start_end : op_timestamp_info.second) {
174         ofs << start_end.start_timestamp << "," << start_end.duration << " ";
175       }
176       ofs << std::endl;
177     }
178   } catch (const std::exception &e) {
179     MS_LOG(ERROR) << "Write " << file_path << "failed: " << e.what();
180   }
181   ofs.close();
182   ChangeFileMode(file_path);
183 }
184 
ChangeFileMode(const std::string & file_path) const185 void DataSaver::ChangeFileMode(const std::string &file_path) const {
186   if (chmod(common::SafeCStr(file_path), S_IRUSR | S_IWUSR) == -1) {
187     MS_LOG(WARNING) << "Modify file: " << file_path << " to rw fail.";
188     return;
189   }
190 }
191 }  // namespace profiler
192 }  // namespace mindspore
193