1 /** 2 * Copyright 2021-2022 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PERF_CPU_SAMPLER_H_ 17 #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PERF_CPU_SAMPLER_H_ 18 19 #include <memory> 20 #include <string> 21 #include <unordered_map> 22 #include <vector> 23 #include <nlohmann/json.hpp> 24 #include "minddata/dataset/engine/perf/profiling.h" 25 #include "minddata/dataset/engine/datasetops/dataset_op.h" 26 27 namespace mindspore { 28 namespace dataset { 29 30 class ExecutionTree; 31 32 typedef struct SystemStat_s { 33 uint64_t user_stat; 34 uint64_t sys_stat; 35 uint64_t io_stat; 36 uint64_t idle_stat; 37 uint64_t total_stat; 38 } SystemStat; 39 40 typedef struct SystemUtil_s { 41 uint8_t user_utilization; 42 uint8_t sys_utilization; 43 uint8_t io_utilization; 44 uint8_t idle_utilization; 45 } SystemUtil; 46 47 typedef struct TaskStat_s { 48 uint64_t user_stat; 49 uint64_t sys_stat; 50 } TaskStat; 51 52 struct TaskUtil_s { 53 float user_utilization; 54 float sys_utilization; 55 }; 56 57 typedef struct MemoryInfo_s { 58 float vss; 59 float rss; 60 float pss; 61 } MemoryInfo; 62 63 typedef struct SystemMemInfo_s { 64 float total_mem; 65 float available_mem; 66 float used_mem; 67 } SystemMemInfo; 68 69 typedef struct TaskUtil_s TaskUtil; 70 typedef struct TaskUtil_s OpUtil; 71 72 class SystemInfo { 73 public: SystemInfo()74 SystemInfo() 75 : first_sample_(true), 76 prev_context_switch_count_(0), 77 last_mem_sampling_failed_(false), 78 prev_system_memory_info_({0, 0, 0}) {} 79 // Read in current stats and return previous and currently read stats 80 Status SampleAndGetCurrPrevStat(SystemStat *current_stat, SystemStat *previous_stat); 81 static int32_t num_cpu_; GetRunningProcess()82 const std::vector<uint32_t> &GetRunningProcess() const { return running_process_; } GetContextSwitchCount()83 const std::vector<uint64_t> &GetContextSwitchCount() const { return context_switch_count_; } 84 Status GetUserCpuUtil(uint64_t start_index, uint64_t end_index, std::vector<uint8_t> *result) const; 85 Status GetSysCpuUtil(uint64_t start_index, uint64_t end_index, std::vector<uint8_t> *result) const; 86 std::vector<uint8_t> GetIOCpuUtil() const; 87 std::vector<uint8_t> GetIdleCpuUtil() const; 88 Status GetSystemMemInfo(SystemMemoryMetric metric, uint64_t start_index, uint64_t end_index, 89 std::vector<float> *result) const; 90 91 private: 92 Status ParseCpuInfo(const std::string &str); 93 Status ParseCtxt(const std::string &str); 94 Status ParseRunningProcess(const std::string &str); 95 Status SampleSystemMemInfo(); 96 SystemStat prev_sys_stat_{}; // last read data /proc/stat file 97 std::vector<SystemUtil> sys_cpu_util_; // vector of system cpu utilization 98 std::vector<uint32_t> running_process_; // vector of running processes in system 99 std::vector<uint64_t> context_switch_count_; // vector of number of context switches between two sampling points 100 bool first_sample_; // flag to indicate first time sampling 101 uint64_t prev_context_switch_count_; // last read context switch count from /proc/stat file 102 std::vector<SystemMemInfo> system_memory_info_; 103 SystemMemInfo prev_system_memory_info_; 104 bool last_mem_sampling_failed_; 105 }; 106 107 class TaskCpuInfo { 108 public: TaskCpuInfo(pid_t pid)109 explicit TaskCpuInfo(pid_t pid) 110 : pid_(pid), prev_task_stat_(TaskStat{0, 0}), first_sample_(true), last_sampling_failed_(false) {} 111 virtual ~TaskCpuInfo() = default; 112 virtual Status Sample(uint64_t total_time_elapsed) = 0; 113 virtual pid_t GetId() = 0; 114 TaskUtil GetLatestCpuUtil() const; 115 std::vector<uint16_t> GetSysCpuUtil() const; 116 std::vector<uint16_t> GetUserCpuUtil() const; 117 118 protected: 119 pid_t pid_; 120 TaskStat prev_task_stat_; 121 std::vector<TaskUtil> task_cpu_util_; 122 bool first_sample_; 123 bool last_sampling_failed_; 124 }; 125 126 class ProcessInfo : public TaskCpuInfo { 127 public: 128 explicit ProcessInfo(pid_t pid, bool track_history = false) TaskCpuInfo(pid)129 : TaskCpuInfo(pid), 130 prev_memory_info_(MemoryInfo{0.0, 0.0, 0.0}), 131 last_mem_sampling_failed_(false), 132 track_sampled_history_(track_history) {} 133 ~ProcessInfo() override = default; 134 Status Sample(uint64_t total_time_elapsed) override; GetId()135 pid_t GetId() override { return pid_; } 136 Status GetMemoryInfo(ProcessMemoryMetric metric, uint64_t start_index, uint64_t end_index, 137 std::vector<float> *result) const; 138 void AddChildProcess(const std::shared_ptr<ProcessInfo> &child_ptr); 139 140 private: 141 Status SampleMemInfo(); 142 MemoryInfo GetLatestMemoryInfo() const; 143 MemoryInfo prev_memory_info_; 144 std::vector<MemoryInfo> process_memory_info_; 145 std::vector<std::shared_ptr<ProcessInfo>> child_processes_; 146 bool IsParent(); 147 bool last_mem_sampling_failed_; 148 bool track_sampled_history_; 149 }; 150 151 class ThreadCpuInfo : public TaskCpuInfo { 152 public: ThreadCpuInfo(pid_t pid,pid_t tid)153 explicit ThreadCpuInfo(pid_t pid, pid_t tid) : TaskCpuInfo(pid), tid_(tid) {} 154 ~ThreadCpuInfo() override = default; 155 Status Sample(uint64_t total_time_elapsed) override; GetId()156 pid_t GetId() override { return tid_; } 157 158 private: 159 pid_t tid_; 160 }; 161 162 class MDOperatorCpuInfo { 163 public: 164 void AddTask(const std::shared_ptr<TaskCpuInfo> &task_ptr); 165 bool TaskExists(pid_t id) const; MDOperatorCpuInfo(const int32_t op_id)166 explicit MDOperatorCpuInfo(const int32_t op_id) : id_(op_id) {} 167 void CalculateOperatorUtilization(); 168 Status GetUserCpuUtil(uint64_t start_index, uint64_t end_index, std::vector<uint16_t> *result) const; 169 Status GetSysCpuUtil(uint64_t start_index, uint64_t end_index, std::vector<uint16_t> *result) const; 170 171 private: 172 int32_t id_; 173 // tid is key for threadinfo, pid is key for processinfo 174 std::unordered_map<pid_t, std::shared_ptr<TaskCpuInfo>> task_by_id_; 175 std::vector<OpUtil> op_cpu_util_; 176 }; 177 178 class CpuSampler : public Sampling { 179 using Timestamps = std::vector<uint64_t>; 180 181 public: CpuSampler(ExecutionTree * tree)182 explicit CpuSampler(ExecutionTree *tree) : fetched_all_python_multiprocesses_(false), tree(tree) {} 183 ~CpuSampler() = default; 184 Status Sample() override; 185 Status Init() override; 186 Status ChangeFileMode(const std::string &dir_path, const std::string &rank_id) override; 187 Status SaveToFile(const std::string &dir_path, const std::string &rank_id) override; Name()188 std::string Name() const override { return kCpuSamplerName; } 189 Status GetSystemUserCpuUtil(uint64_t start_ts, uint64_t end_ts, std::vector<uint8_t> *result); 190 Status GetSystemSysCpuUtil(uint64_t start_ts, uint64_t end_ts, std::vector<uint8_t> *result); 191 Status GetOpUserCpuUtil(int32_t op_id, uint64_t start_ts, uint64_t end_ts, std::vector<uint16_t> *result); 192 Status GetOpSysCpuUtil(int32_t op_id, uint64_t start_ts, uint64_t end_ts, std::vector<uint16_t> *result); 193 Status GetProcessMemoryInfo(ProcessMemoryMetric metric, uint64_t start_index, uint64_t end_index, 194 std::vector<float> *result); 195 Status GetSystemMemoryInfo(SystemMemoryMetric metric, uint64_t start_index, uint64_t end_index, 196 std::vector<float> *result); 197 198 // Clear all collected data 199 void Clear() override; 200 201 private: 202 Status UpdateTaskList(); 203 bool fetched_all_python_multiprocesses_{}; 204 ExecutionTree *tree = nullptr; 205 pid_t main_pid_{}; 206 Timestamps ts_; 207 SystemInfo sys_info_; // stores the system cpu utilization 208 std::vector<std::shared_ptr<TaskCpuInfo>> tasks_; // vector of all process and thread tasks 209 std::shared_ptr<ThreadCpuInfo> main_thread_cpu_info_; 210 std::shared_ptr<ProcessInfo> main_process_info_; 211 std::unordered_map<int32_t, MDOperatorCpuInfo> op_info_by_id_; 212 Path GetFileName(const std::string &dir_path, const std::string &rank_id) override; 213 }; 214 } // namespace dataset 215 } // namespace mindspore 216 #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PERF_CPU_SAMPLER_H_ 217