1 /** 2 * Copyright 2020-2022 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PERF_PROFILING_H_ 17 #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PERF_PROFILING_H_ 18 19 #include <atomic> 20 #include <chrono> 21 #include <memory> 22 #include <mutex> 23 #include <string> 24 #include <unordered_map> 25 #include <vector> 26 #include <nlohmann/json.hpp> 27 #include "minddata/dataset/util/path.h" 28 #include "minddata/dataset/util/status.h" 29 #include "minddata/dataset/engine/perf/monitor.h" 30 31 namespace mindspore { 32 namespace dataset { 33 34 class Monitor; 35 class ExecutionTree; 36 class TreeConsumer; 37 class CpuSampler; 38 class TreeAdapter; 39 40 const char kDeviceQueueTracingName[] = "Device_Queue_Tracing"; 41 const char kDatasetIteratorTracingName[] = "Dataset_Iterator_Tracing"; 42 const char kConnectorSizeSamplingName[] = "Connector_Size_Sampling"; 43 const char kCpuSamplerName[] = "Cpu_Sampler"; 44 45 // Values for process memory metrics - common for profiling and cpu_sampler 46 enum ProcessMemoryMetric { kPSS, kRSS, kVSS }; 47 48 // Values for system memory metrics - common for profiling and cpu_sampler 49 enum SystemMemoryMetric { kMemoryAvailable, kMemoryTotal, kMemoryUsed }; 50 51 // Profiling is a class of basic unit of profiling action 52 // This base class encapsulate the serialization output logic 53 class Profiling : public std::enable_shared_from_this<Profiling> { 54 public: 55 // Constructor Profiling()56 Profiling() : active_(false) {} 57 58 // Destructor 59 virtual ~Profiling() = default; 60 61 virtual Status Init() = 0; 62 63 // Default serialization file generator 64 virtual Status SaveToFile(const std::string &dir_path, const std::string &rank_id) = 0; 65 66 // Profiling name 67 virtual std::string Name() const = 0; 68 69 virtual Status ChangeFileMode(const std::string &dir_path, const std::string &rank_id) = 0; 70 71 // Start collecting data 72 Status Start(); 73 74 // Stop collecting data 75 Status Stop(); 76 77 // Clear all collected data 78 virtual void Clear() = 0; 79 80 protected: 81 bool active_; // show current state of ProfilingManager (running, or paused) 82 std::mutex lock_; 83 virtual Path GetFileName(const std::string &dir_path, const std::string &rank_id) = 0; 84 }; 85 86 // Sampling is a class of profiling which generate samples periodically. 87 class Sampling : public Profiling { 88 public: 89 // Sampling action function. This function will be invoked by performance monitor thread. 90 virtual Status Sample() = 0; 91 92 ~Sampling() override = default; 93 }; 94 95 typedef struct TracingRecord_s { 96 int32_t type; 97 int32_t extra_info; 98 int32_t batch_num; 99 int32_t value; 100 uint64_t ts; 101 ToStringTracingRecord_s102 std::string ToString() const { 103 return std::to_string(type) + " " + std::to_string(extra_info) + " " + std::to_string(batch_num) + " " + 104 std::to_string(value) + " " + std::to_string(ts); 105 } 106 } TracingRecord; 107 108 // Tracing is class of profiling which record samples upon request. 109 class Tracing : public Profiling { 110 public: 111 // Tracing has minimal interface to provide flexible on data recording. 112 // It only includes some common routines. 113 Status SaveToFile(const std::string &dir_path, const std::string &rank_id) override; 114 Status ChangeFileMode(const std::string &dir_path, const std::string &rank_id) override; 115 Status Init() override; 116 Status GetPipelineTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result); 117 Status GetPushTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result); 118 Status GetBatchTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result); 119 Status GetConnectorSize(int32_t start_step, int32_t end_step, std::vector<int32_t> *result); 120 Status GetConnectorCapacity(int32_t start_step, int32_t end_step, std::vector<int32_t> *result); 121 Status GetEmptyQueueFrequency(int32_t start_step, int32_t end_step, float_t *empty_queue_freq); 122 void Record(const int32_t type, const int32_t extra_info, const int32_t batch_num, const int32_t value, 123 const uint64_t time_stamp); 124 Status TimeIntervalForStepRange(int32_t start_step, int32_t end_step, uint64_t *start_ts, uint64_t *end_ts); 125 Status StepIntervalForTimeRange(uint64_t start_ts, uint64_t end_ts, int32_t *start_step, int32_t *end_step); 126 size_t GetNumberSteps(); 127 128 // Clear all collected data 129 void Clear() override; 130 131 protected: 132 Tracing() = default; 133 std::vector<std::string> value_; 134 std::vector<TracingRecord> records_; 135 std::vector<uint64_t> ts_; // End time of each step or batch 136 Status GetRecordEntryFieldValue(int32_t start_step, int32_t end_step, int32_t record_offset, const std::string &field, 137 std::vector<int32_t> *result); 138 }; 139 140 // ProfilingManager is a class manages all profiling infrastructure 141 // It serves the following purposes: 142 // 1) Fetch profiling configs from global contexts 143 // 2) Setup all profiling node based on config 144 // 3) Provide access of profiling nodes for profiling actions 145 // 4) Manage profiling data serialization process 146 class ProfilingManager { 147 friend Monitor; 148 149 public: 150 ProfilingManager(); 151 152 ~ProfilingManager() = default; 153 154 /// Register the given tree to be profiled. 155 /// This method should be called once, calling it for another tree without resetting the ProfilingManager would fail. 156 /// \param tree_adapter pointer the adapter that owns the ExecutionTree 157 /// \return Status the status code returned 158 Status RegisterTree(const TreeAdapter *tree_adapter); 159 160 /// Reset the ProfilingManager. This method is sued when we want to profile another tree in the same process. 161 /// \return Status the status code returned 162 Status Reset(); 163 164 // Save profile data to file 165 // @param dir_path_ The path to the directory where the profiling data will be saved. 166 // @return Status The status code returned 167 Status SaveProfilingData(const std::string &dir_path, const std::string &rank_id); 168 169 // Sampling node getter 170 // @param name - The name of the requested node 171 // @param node - Pointer to the shared pointer for the Sampling node 172 // @return Status The status code returned 173 Status GetSamplingNode(const std::string &name, std::shared_ptr<Sampling> *node); 174 175 // Tracing node getter 176 // @param name - The name of the requested node 177 // @param node - Pointer to the shared pointer for the Tracing node 178 // @return Status The status code returned 179 Status GetTracingNode(const std::string &name, std::shared_ptr<Tracing> *node); 180 181 // return true if enabled_ is set to true, namely if Init() has been called successfully 182 // @param tree - Execution tree pointer 183 bool IsProfilingEnable(const ExecutionTree *tree = nullptr) const; 184 185 // Record end of epoch information 186 // @param step_num - The number of steps 187 void RecordEndOfEpoch(uint32_t step_num); 188 GetSamplingNodes()189 const std::unordered_map<std::string, std::shared_ptr<Sampling>> &GetSamplingNodes() const { return sampling_nodes_; } 190 191 // Launch monitoring thread. 192 Status LaunchMonitor(); 193 194 // @return Status The status code returned 195 Status ChangeFileMode(const std::string &dir_path, const std::string &rank_id); 196 197 #ifndef ENABLE_ANDROID 198 /// \brief API to get User CPU utilization for the system 199 /// \param [in] epoch_num The epoch number for which results are requested 200 /// \param [out] result A vector with the sampled User CPU Utilization for the entire system 201 /// \return Status object with the error code 202 Status GetUserCpuUtilByEpoch(int32_t epoch_num, std::vector<uint8_t> *result); 203 204 /// \brief API to get User CPU utilization for the system 205 /// \param [in] start_step The step interval start range 206 /// \param [in] end_step The step interval end range 207 /// \param [out] result A vector with the sampled User CPU Utilization for the entire system 208 /// \return Status object with the error code 209 Status GetUserCpuUtilByStep(int32_t start_step, int32_t end_step, std::vector<uint8_t> *result); 210 211 /// \brief API to get User CPU utilization for the system 212 /// \param [in] start_ts The time interval start range in ms 213 /// \param [in] end_ts The time interval end range in ms 214 /// \param [out] result A vector with the sampled User CPU Utilization for the entire system 215 /// \return Status object with the error code 216 Status GetUserCpuUtilByTime(uint64_t start_ts, uint64_t end_ts, std::vector<uint8_t> *result); 217 218 /// \brief API to get System CPU utilization for the system 219 /// \param [in] epoch_num The epoch number for which results are requested 220 /// \param [out] result A vector with the sampled System CPU Utilization for the entire system 221 /// \return Status object with the error code 222 Status GetSysCpuUtilByEpoch(int32_t epoch_num, std::vector<uint8_t> *result); 223 224 /// \brief API to get System CPU utilization for the system 225 /// \param [in] start_step The step interval start range 226 /// \param [in] end_step The step interval end range 227 /// \param [out] result A vector with the sampled System CPU Utilization for the entire system 228 /// \return Status object with the error code 229 Status GetSysCpuUtilByStep(int32_t start_step, int32_t end_step, std::vector<uint8_t> *result); 230 231 /// \brief API to get System CPU utilization for the system 232 /// \param [in] start_ts The time interval start range in ms 233 /// \param [in] end_ts The time interval end range in ms 234 /// \param [out] result A vector with the sampled System CPU Utilization for the entire system 235 /// \return Status object with the error code 236 Status GetSysCpuUtilByTime(uint64_t start_ts, uint64_t end_ts, std::vector<uint8_t> *result); 237 238 /// \brief API to get User CPU Utilization of an MD operator 239 /// \param [in] op_id The id of the operator 240 /// \param [in] epoch_num The epoch number for which results are requested 241 /// \param [out] result A vector with the sampled User CPU Utilization of the operator. 242 /// \return Status object with the error code 243 Status GetUserCpuUtilByEpoch(int32_t op_id, int32_t epoch_num, std::vector<uint16_t> *result); 244 245 /// \brief API to get User CPU Utilization of an MD operator 246 /// \param [in] op_id The id of the operator 247 /// \param [in] start_step The step interval start range 248 /// \param [in] end_step The step interval end range 249 /// \param [out] result A vector with the sampled User CPU Utilization of the operator. 250 /// \return Status object with the error code 251 Status GetUserCpuUtilByStep(int32_t op_id, int32_t start_step, int32_t end_step, std::vector<uint16_t> *result); 252 253 /// \brief API to get User CPU Utilization of an MD operator 254 /// \param [in] op_id The id of the operator 255 /// \param [in] start_ts The time interval start range in ms 256 /// \param [in] end_ts The time interval end range in ms 257 /// \param [out] result A vector with the sampled User CPU Utilization of the operator. 258 /// \return Status object with the error code 259 Status GetUserCpuUtilByTime(int32_t op_id, uint64_t start_ts, uint64_t end_ts, std::vector<uint16_t> *result); 260 261 /// \brief API to get System CPU Utilization of an MD operator 262 /// \param [in] op_id The id of the operator 263 /// \param [in] epoch_num The epoch number for which results are requested 264 /// \param [out] result A vector with the sampled System CPU Utilization of the operator. 265 /// \return Status object with the error code 266 Status GetSysCpuUtilByEpoch(int32_t op_id, int32_t epoch_num, std::vector<uint16_t> *result); 267 268 /// \brief API to get System CPU Utilization of an MD operator 269 /// \param [in] op_id The id of the operator 270 /// \param [in] start_step The step interval start range 271 /// \param [in] end_step The step interval end range 272 /// \param [out] result A vector with the sampled System CPU Utilization of the operator. 273 /// \return Status object with the error code 274 Status GetSysCpuUtilByStep(int32_t op_id, int32_t start_step, int32_t end_step, std::vector<uint16_t> *result); 275 276 /// \brief API to get System CPU Utilization of an MD operator 277 /// \param [in] op_id The id of the operator 278 /// \param [in] start_ts The time interval start range in ms 279 /// \param [in] end_ts The time interval end range in ms 280 /// \param [out] result A vector with the sampled System CPU Utilization of the operator. 281 /// \return Status object with the error code 282 Status GetSysCpuUtilByTime(int32_t op_id, uint64_t start_ts, uint64_t end_ts, std::vector<uint16_t> *result); 283 284 /// \brief API to get information on main process memory usage 285 /// \param [in] metric The requested memory set usage. One of these values: 286 /// - ProcessMemoryMetric::kVSS - virtual set size, virtual memory usage 287 /// - ProcessMemoryMetric::kPSS - proportional set size, physical memory usage with proportional allocation of 288 /// shared libraries 289 /// - ProcessMemoryMetric::kRSS - resident set size, physical memory usage (includes shared libraries) 290 /// \param [in] epoch_num The epoch number for which results are requested 291 /// \param [out] result The desired value in MB 292 /// \return Status object with the error code 293 Status GetMainProcessMemoryInfoByEpoch(ProcessMemoryMetric metric, int32_t epoch_num, std::vector<float> *result); 294 295 /// \brief API to get information on main process memory usage 296 /// \param [in] metric The requested memory set usage. One of these values: 297 /// - ProcessMemoryMetric::kVSS - virtual set size, virtual memory usage 298 /// - ProcessMemoryMetric::kPSS - proportional set size, physical memory usage with proportional allocation of 299 /// shared libraries 300 /// - ProcessMemoryMetric::kRSS - resident set size, physical memory usage (includes shared libraries) 301 /// \param [in] end_ts The time interval end range in ms 302 /// \param [in] start_step The step interval start range 303 /// \param [in] end_step The step interval end range 304 /// \param [out] result The desired value in MB 305 /// \return Status object with the error code 306 Status GetMainProcessMemoryInfoByStep(ProcessMemoryMetric metric, int32_t start_step, int32_t end_step, 307 std::vector<float> *result); 308 309 /// \brief API to get information on main process memory usage 310 /// \param [in] metric The requested memory set usage. One of these values: 311 /// - ProcessMemoryMetric::kVSS - virtual set size, virtual memory usage 312 /// - ProcessMemoryMetric::kPSS - proportional set size, physical memory usage with proportional allocation of 313 /// shared libraries 314 /// - ProcessMemoryMetric::kRSS - resident set size, physical memory usage (includes shared libraries) 315 /// \param [in] start_ts The time interval start range in ms 316 /// \param [in] end_ts The time interval end range in ms 317 /// \param [out] result The desired value in MB 318 /// \return Status object with the error code 319 Status GetMainProcessMemoryInfoByTime(ProcessMemoryMetric metric, uint64_t start_ts, uint64_t end_ts, 320 std::vector<float> *result); 321 322 /// \brief API to get information on system memory usage 323 /// \param [in] metric The requested memory metric. One of these values: 324 /// - SystemMemoryMetric::kMemoryAvailable 325 /// - SystemMemoryMetric::kMemoryTotal 326 /// - SystemMemoryMetric::kMemoryUsed 327 /// \param [in] epoch_num The epoch number for which results are requested 328 /// \param [out] result The desired value in MB 329 /// \return Status object with the error code 330 Status GetSystemMemoryInfoByEpoch(SystemMemoryMetric metric, int32_t epoch_num, std::vector<float> *result); 331 332 /// \brief API to get information on system memory usage 333 /// \param [in] metric The requested memory metric. One of these values: 334 /// - SystemMemoryMetric::kMemoryAvailable 335 /// - SystemMemoryMetric::kMemoryTotal 336 /// - SystemMemoryMetric::kMemoryUsed 337 /// \param [in] start_step The step interval start range 338 /// \param [in] end_step The step interval end range 339 /// \param [out] result The desired value in MB 340 /// \return Status object with the error code 341 Status GetSystemMemoryInfoByStep(SystemMemoryMetric metric, int32_t start_step, int32_t end_step, 342 std::vector<float> *result); 343 344 /// \brief API to get information on system memory usage 345 /// \param [in] metric The requested memory metric. One of these values: 346 /// - SystemMemoryMetric::kMemoryAvailable 347 /// - SystemMemoryMetric::kMemoryTotal 348 /// - SystemMemoryMetric::kMemoryUsed 349 /// \param [in] start_ts The time interval start range in ms 350 /// \param [in] end_ts The time interval end range in ms 351 /// \param [out] result The desired value in MB 352 /// \return Status object with the error code 353 Status GetSystemMemoryInfoByTime(SystemMemoryMetric metric, uint64_t start_ts, uint64_t end_ts, 354 std::vector<float> *result); 355 #endif 356 357 /// \brief API to get the connector size of an MD operator 358 /// \param [in] op_id The id of the operator 359 /// \param [in] epoch_num The epoch number for which results are requested 360 /// \param [out] result A vector with the sampled connector sizes of the operator 361 /// \return Status object with the error code 362 Status GetConnectorSizeByEpoch(int32_t op_id, int32_t epoch_num, std::vector<int32_t> *result); 363 364 /// \brief API to get the connector size of an MD operator 365 /// \param [in] op_id The id of the operator 366 /// \param [in] start_step The step interval start range 367 /// \param [in] end_step The step interval end range 368 /// \param [out] result A vector with the sampled connector sizes of the operator 369 /// \return Status object with the error code 370 Status GetConnectorSizeByStep(int32_t op_id, int32_t start_step, int32_t end_step, std::vector<int32_t> *result); 371 372 /// \brief API to get the connector size of an MD operator 373 /// \param [in] op_id The id of the operator 374 /// \param [in] start_ts The time interval start range in ms 375 /// \param [in] end_ts The time interval end range in ms 376 /// \param [out] result A vector with the sampled connector sizes of the operator 377 /// \return Status object with the error code 378 Status GetConnectorSizeByTime(int32_t op_id, uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result); 379 380 /// \brief API to get the connector size of DatasetIterator or DataQueueOp 381 /// \param [in] epoch_num The epoch number for which results are requested 382 /// \param [out] result A vector with connector size at each step 383 /// \return Status object with the error code 384 Status GetConnectorSizeByEpoch(int32_t epoch_num, std::vector<int32_t> *result); 385 386 /// \brief API to get the connector size of DatasetIterator or DataQueueOp 387 /// \param [in] start_step The step interval start range 388 /// \param [in] end_step The step interval end range 389 /// \param [out] result A vector with connector size at each step 390 /// \return Status object with the error code 391 Status GetConnectorSizeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result); 392 393 /// \brief API to get the connector size of DatasetIterator or DataQueueOp 394 /// \param [in] start_ts The time interval start range in ms 395 /// \param [in] end_ts The time interval end range in ms 396 /// \param [out] result A vector with connector size at each step 397 /// \return Status object with the error code 398 Status GetConnectorSizeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result); 399 400 /// \brief API to get the connector capacity of DatasetIterator or DataQueueOp 401 /// \param [in] epoch_num The epoch number for which results are requested 402 /// \param [out] result A vector with connector capacity at each step 403 /// \return Status object with the error code 404 Status GetConnectorCapacityByEpoch(int32_t epoch_num, std::vector<int32_t> *result); 405 406 /// \brief API to get the connector capacity of DatasetIterator or DataQueueOp 407 /// \param [in] start_step The step interval start range 408 /// \param [in] end_step The step interval end range 409 /// \param [out] result A vector with connector capacity at each step 410 /// \return Status object with the error code 411 Status GetConnectorCapacityByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result); 412 413 /// \brief API to get the connector capacity of DatasetIterator or DataQueueOp 414 /// \param [in] start_ts The time interval start range in ms 415 /// \param [in] end_ts The time interval end range in ms 416 /// \param [out] result A vector with connector capacity for steps in the given time range 417 /// \return Status object with the error code 418 Status GetConnectorCapacityByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result); 419 420 /// \brief API to get the pipeline time of batches 421 /// \param [in] epoch_num The epoch number for which results are requested 422 /// \param [out] result A vector with the pipeline time for each step 423 /// \return Status object with the error code 424 Status GetPipelineTimeByEpoch(int32_t epoch_num, std::vector<int32_t> *result); 425 426 /// \brief API to get the pipeline time of batches 427 /// \param [in] start_step The step interval start range 428 /// \param [in] end_step The step interval end range 429 /// \param [out] result A vector with the pipeline time for each step 430 /// \return Status object with the error code 431 Status GetPipelineTimeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result); 432 433 /// \brief API to get the pipeline time of batches 434 /// \param [in] start_ts The time interval start range in ms 435 /// \param [in] end_ts The time interval end range in ms 436 /// \param [out] result A vector with the pipeline time for steps in the given time range 437 /// \return Status object with the error code 438 Status GetPipelineTimeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result); 439 440 /// \brief API to get the push time of batches 441 /// \param [in] epoch_num The epoch number for which results are requested 442 /// \param [out] result A vector with the push time for each each step 443 /// \return Status object with the error code 444 Status GetPushTimeByEpoch(int32_t epoch_num, std::vector<int32_t> *result); 445 446 /// \brief API to get the push time of batches 447 /// \param [in] start_step The step interval start range 448 /// \param [in] end_step The step interval end range 449 /// \param [out] result A vector with the push time for each each step 450 /// \return Status object with the error code 451 Status GetPushTimeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result); 452 453 /// \brief API to get the push time of batches 454 /// \param [in] start_ts The time interval start range in ms 455 /// \param [in] end_ts The time interval end range in ms 456 /// \param [out] result A vector with the push time for steps in the given time range 457 /// \return Status object with the error code 458 Status GetPushTimeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result); 459 460 /// \brief API to get the batch time of batches 461 /// \param [in] epoch_num The epoch number for which results are requested 462 /// \param [out] result A vector with the batch time for each step 463 /// \return Status object with the error code 464 Status GetBatchTimeByEpoch(int32_t epoch_num, std::vector<int32_t> *result); 465 466 /// \brief API to get the batch time of batches 467 /// \param [in] start_step The step interval start range 468 /// \param [in] end_step The step interval end range 469 /// \param [out] result A vector with the batch time for each step 470 /// \return Status object with the error code 471 Status GetBatchTimeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result); 472 473 /// \brief API to get the batch time of batches 474 /// \param [in] start_ts The time interval start range in ms 475 /// \param [in] end_ts The time interval end range in ms 476 /// \param [out] result A vector with the batch time for steps in the given time range 477 /// \return Status object with the error code 478 Status GetBatchTimeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result); 479 480 /// \brief API to get fraction of steps that DatasetIterator or DataQueueOp connector was empty 481 /// \param [in] epoch_num The epoch number for which results are requested 482 /// \param [out] result The empty queue frequency 483 /// \return Status object with the error code 484 Status GetEmptyQueueFrequencyByEpoch(int32_t epoch_num, float_t *result); 485 486 /// \brief API to get fraction of steps that DatasetIterator or DataQueueOp connector was empty 487 /// \param [in] start_step The step interval start range 488 /// \param [in] end_step The step interval end range 489 /// \param [out] result The empty queue frequency 490 /// \return Status object with the error code 491 Status GetEmptyQueueFrequencyByStep(int32_t start_step, int32_t end_step, float_t *result); 492 493 /// \brief API to get fraction of steps that DatasetIterator or DataQueueOp connector was empty 494 /// \param [in] start_ts The time interval start range in ms 495 /// \param [in] end_ts The time interval end range in ms 496 /// \param [out] result The empty queue frequency 497 /// \return Status object with the error code 498 Status GetEmptyQueueFrequencyByTime(uint64_t start_ts, uint64_t end_ts, float_t *result); 499 500 // Register profile node to tree 501 // @param node - Profiling node 502 // @return Status The status code returned 503 Status RegisterTracingNode(const std::shared_ptr<Tracing> &node); 504 505 /// \brief API to initialize profiling manager 506 /// \param for_autotune flag to indicate if Profiler is initialized for autotuning or profiling purposes 507 /// \return Status object with the error code 508 Status Init(const bool for_autotune = false); 509 510 /// \brief API to signal the profiling nodes to start collecting data 511 /// \return Status object with the error code 512 Status Start(); 513 514 /// \brief API to signal profiling nodes to stop collecting data 515 /// \return Status object with the error code 516 Status Stop(); 517 518 /// \brief API to save to file all the collected data between Start and Stop calls 519 /// \return Status object with the error code 520 Status Save(const std::string &profile_data_path); 521 522 /// \brief Helper to get the rank id. Currently being used for appending rank id to files 523 /// \return String The rank id 524 std::string GetRankID() const; 525 526 /// Get number of epochs that have been already profiled 527 /// \return number of epochs GetNumOfProfiledEpochs()528 int32_t GetNumOfProfiledEpochs() const { return static_cast<int32_t>(epoch_end_step_.size()) - 1; } 529 530 // Get number of steps taken in pipeline 531 /// \return number of steps 532 Status GetNumberOfProfiledSteps(int32_t *steps); 533 534 /// Determine if the Profiler is being used for autotuning. 535 /// \return boolean IsAutotuning()536 bool IsAutotuning() const { return autotuning_; } 537 538 /// Determine if the Profiler is being used for profiling. 539 /// \return boolean IsProfiling()540 bool IsProfiling() const { return profiling_; } 541 542 // Registration state for the profiler 543 enum ProfilingRegistrationState { 544 kNotEnabled, 545 kEnabledTreeNotRegistered, 546 kEnabledTreeRegistered, 547 kEnabledDifferentTreeRegistered, 548 }; 549 550 /// \brief Getter for the profiling and tree registration state 551 /// \param tree Execution Tree pointer 552 /// \return ProfilingRegistrationState 553 ProfilingRegistrationState GetProfilerTreeState(const ExecutionTree *tree) const; 554 555 protected: 556 std::unique_ptr<Monitor> perf_monitor_; 557 558 // State flags for profiling 559 enum ProfilingState { 560 kProfilingStateUnBegun, 561 kProfilingStateRunning, 562 kProfilingStateFinished, 563 }; 564 ProfilingState profiling_state_; // show current state of ProfilingManager (running, or paused) 565 std::unordered_map<std::string, std::shared_ptr<Tracing>> tracing_nodes_; 566 std::unordered_map<std::string, std::shared_ptr<Sampling>> sampling_nodes_; 567 ExecutionTree *tree_; // ExecutionTree pointer 568 std::vector<uint64_t> epoch_end_ts_; // End of epoch timestamp 569 std::vector<uint32_t> epoch_end_step_; // End of epoch step number 570 std::atomic<bool> autotuning_; // flag to indicate if ProfilingManager is being used for auto-tuning the pipeline 571 std::atomic<bool> profiling_; // flag to indicate if ProfilingManager is being used for profiling the pipeline 572 573 // Register profile node to tree 574 // @param node - Profiling node 575 // @return Status The status code returned 576 Status RegisterSamplingNode(const std::shared_ptr<Sampling> &node); 577 578 /// \brief Helper to convert a given epoch number to a step interval 579 /// \param [in] epoch_num The epoch number to be converted 580 /// \param [out] start_step The corresponding start step for the given epoch 581 /// \param [out] end_step The corresponding end step for the given epoch 582 /// \return Status object with the error code 583 Status EpochToStepInterval(int32_t epoch_num, uint32_t *start_step, uint32_t *end_step); 584 585 /// \brief Helper to convert a given epoch number to a time interval 586 /// \param [in] epoch_num The epoch number to be converted 587 /// \param [out] start_ts The corresponding starting timestamp in ms for the given epoch 588 /// \param [out] end_ts The corresponding ending timestamp in ms for the given epoch 589 /// \return Status object with the error code 590 Status EpochToTimeInterval(int32_t epoch_num, uint64_t *start_ts, uint64_t *end_ts); 591 592 /// \brief Helper to convert step interval to a time interval 593 /// \param [in] start_step The step interval start range 594 /// \param [in] end_step The step interval end range 595 /// \param [out] start_ts The corresponding starting timestamp in ms for the given step interval 596 /// \param [out] end_ts The corresponding ending timestamp in ms for the given step interval 597 /// \return Status object with the error code 598 Status StepToTimeInterval(int32_t start_step, int32_t end_step, uint64_t *start_ts, uint64_t *end_ts); 599 600 /// \brief Helper to convert time interval to a step interval 601 /// \param [in] start_ts The time interval start range in ms 602 /// \param [in] end_ts The time interval end range in ms 603 /// \param [out] start_step The corresponding start step for the given time interval 604 /// \param [out] end_step The corresponding end step for the given time interval 605 /// \return Status object with the error code 606 Status TimeToStepInterval(uint64_t start_ts, uint64_t end_ts, int32_t *start_step, int32_t *end_step); 607 }; 608 609 enum ProfilingType { TIME, CONNECTOR_DEPTH }; 610 611 enum ProfilingTimeSubType { 612 PIPELINE_TIME, 613 TDT_PUSH_TIME, 614 BATCH_TIME, 615 INVALID_TIME, 616 }; 617 618 class ProfilingTime { 619 public: 620 static uint64_t GetCurMilliSecond(); 621 }; 622 } // namespace dataset 623 } // namespace mindspore 624 #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PERF_PROFILING_H_ 625