1 /** 2 * Copyright 2021 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_CPU_SAMPLING_H 17 #define MINDSPORE_CCSRC_MINDDATA_DATASET_CPU_SAMPLING_H 18 19 #include <memory> 20 #include <string> 21 #include <unordered_map> 22 #include <vector> 23 #include <nlohmann/json.hpp> 24 #include "minddata/dataset/engine/perf/profiling.h" 25 #include "minddata/dataset/engine/datasetops/dataset_op.h" 26 27 namespace mindspore { 28 namespace dataset { 29 class ExecutionTree; 30 31 // CPU information from /proc/stat or /proc/pid/stat file 32 typedef struct CpuStat_s { 33 uint64_t user_stat_; 34 uint64_t sys_stat_; 35 uint64_t io_stat_; 36 uint64_t idle_stat_; 37 uint64_t total_stat_; 38 } CpuStat; 39 40 // Cpu utilization 41 typedef struct CpuInfo_s { 42 uint8_t user_utilization_; 43 uint8_t sys_utilization_; 44 uint8_t io_utilization_; 45 uint8_t idle_utilization_; 46 } CpuUtil; 47 48 // CPU utilization of operator 49 typedef struct CpuOpInfo_s { 50 float user_utilization_; 51 float sys_utilization_; 52 int32_t op_id_; 53 } CpuOpUtil; 54 55 // CPU utilization of process 56 typedef struct CpuProcessInfo_s { 57 float user_utilization_; 58 float sys_utilization_; 59 } CpuProcessUtil; 60 61 // CPU stat of operator 62 typedef struct CpuOpStat_s { 63 uint64_t user_stat_; 64 uint64_t sys_stat_; 65 } CpuOpStat; 66 67 class BaseCpu { 68 public: 69 BaseCpu(); 70 ~BaseCpu() = default; 71 // Collect CPU information 72 virtual Status Collect(const ExecutionTree *tree) = 0; 73 virtual Status SaveToFile(const std::string &file_path) = 0; 74 virtual Status Analyze(std::string *name, double *utilization, std::string *extra_message) = 0; 75 // Get the total CPU time of device 76 Status GetTotalCpuTime(uint64_t *total_stat); 77 78 protected: 79 std::vector<CpuUtil> cpu_util_; 80 CpuStat pre_cpu_stat_; 81 static bool fetched_all_process_shared_; 82 static std::unordered_map<int32_t, std::vector<pid_t>> op_process_shared_; 83 bool fetched_all_process_; 84 bool pre_fetched_state_; 85 std::unordered_map<int32_t, std::vector<pid_t>> op_process_; 86 int32_t cpu_processor_num_; 87 }; 88 89 // Collect device CPU information 90 class DeviceCpu : public BaseCpu { 91 public: DeviceCpu()92 DeviceCpu() : pre_running_process_(0), pre_context_switch_count_(0), first_collect_(true) {} 93 ~DeviceCpu() = default; 94 Status Collect(const ExecutionTree *tree) override; 95 Status SaveToFile(const std::string &file_path) override; 96 Status Analyze(std::string *name, double *utilization, std::string *extra_message) override; 97 98 private: 99 // Get CPU information, include use/sys/idle/io utilization 100 Status ParseCpuInfo(const std::string &str); 101 102 // Get context switch count 103 Status ParseCtxt(const std::string &str); 104 105 // Get running process count 106 Status ParseRunningProcess(const std::string &str); 107 108 std::vector<uint32_t> running_process_; 109 std::vector<uint64_t> context_switch_count_; 110 uint32_t pre_running_process_; 111 uint64_t pre_context_switch_count_; 112 bool first_collect_; 113 }; 114 115 // Collect operator CPU information 116 class OperatorCpu : public BaseCpu { 117 public: OperatorCpu()118 OperatorCpu() : first_collect_(true), pre_total_stat_(0), id_count_(0) {} 119 ~OperatorCpu() = default; 120 Status Collect(const ExecutionTree *tree) override; 121 Status SaveToFile(const std::string &file_path) override; 122 // Analyze will output the name of the metric, the avg utiliization of highest 123 // object within the class and any extra message that would be useful for the user. 124 // The Higher level CPUSampling class will combine information from different classes 125 // to decide if warning should be output. 126 Status Analyze(std::string *name, double *utilization, std::string *extra_message) override; 127 128 private: 129 // Get cpu information, include use/sys/idle/io utilization 130 Status ParseCpuInfo(int32_t op_id, int64_t thread_id, 131 std::unordered_map<int32_t, std::unordered_map<int64_t, CpuOpStat>> *op_stat); 132 133 // Store the CPU utilization of each operator 134 std::vector<std::vector<CpuOpUtil>> cpu_op_util_; 135 136 bool first_collect_; 137 138 // Store the id and its corresponding threads. 139 std::unordered_map<int32_t, std::vector<pid_t>> op_thread_; 140 std::unordered_map<int32_t, std::string> op_name_; 141 std::unordered_map<int32_t, int32_t> op_parallel_workers_; 142 std::unordered_map<int32_t, std::unordered_map<int64_t, CpuOpStat>> pre_op_stat_; 143 uint64_t pre_total_stat_; 144 int32_t id_count_; 145 }; 146 147 // Collect operator CPU information 148 class ProcessCpu : public BaseCpu { 149 public: ProcessCpu()150 ProcessCpu() : first_collect_(true), pre_total_stat_(0) {} 151 ~ProcessCpu() = default; 152 Status Collect(const ExecutionTree *tree) override; 153 Status SaveToFile(const std::string &file_path) override; 154 Status Analyze(std::string *name, double *utilization, std::string *extra_message) override; 155 156 private: 157 // Get CPU information, include use/sys/idle/io utilization 158 Status ParseCpuInfo(); 159 160 bool first_collect_; 161 std::vector<CpuProcessUtil> process_util_; 162 uint64_t pre_total_stat_; 163 std::unordered_map<int64_t, CpuOpStat> pre_process_stat_; 164 std::vector<pid_t> process_id_; 165 }; 166 167 // Sampling CPU information 168 // It support JSON serialization for external usage. 169 class CpuSampling : public Sampling { 170 using TimeStamp = std::vector<uint32_t>; 171 172 public: CpuSampling(ExecutionTree * tree)173 explicit CpuSampling(ExecutionTree *tree) : tree_(tree) {} 174 175 ~CpuSampling() = default; 176 177 // Driver function for CPU sampling. 178 // This function samples the CPU information of device/process/op 179 Status Sample() override; 180 Name()181 std::string Name() const override { return kCpuSamplingName; } 182 183 // Save sampling data to file 184 // @return Status - The error code return 185 Status SaveToFile() override; 186 187 Status Init(const std::string &dir_path, const std::string &device_id) override; 188 189 // Change file mode after save CPU data 190 Status ChangeFileMode() override; 191 192 // Analyze sampling data and print message to log 193 Status Analyze() override; 194 195 private: 196 Status CollectTimeStamp(); 197 198 Status SaveTimeStampToFile(); 199 200 Status SaveSamplingItervalToFile(); 201 202 ExecutionTree *tree_ = nullptr; // ExecutionTree pointer 203 std::vector<std::shared_ptr<BaseCpu>> cpu_; // CPU information of device/process/op 204 TimeStamp time_stamp_; // Time stamp 205 }; 206 207 } // namespace dataset 208 } // namespace mindspore 209 210 #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_CPU_SAMPLING_H 211