/**
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PERF_PROFILING_H_
17 #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PERF_PROFILING_H_
18 
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
#include <nlohmann/json.hpp>
#include "minddata/dataset/util/path.h"
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/engine/perf/monitor.h"
30 
31 namespace mindspore {
32 namespace dataset {
33 
// Forward declarations of dataset-engine types.
class Monitor;
class ExecutionTree;
class TreeConsumer;
class CpuSampler;
class TreeAdapter;
39 
// Name strings of the tracing and sampling profiling nodes.
// NOTE(review): presumably these are the names passed to GetTracingNode()/GetSamplingNode() - confirm in the .cc.
constexpr char kDeviceQueueTracingName[] = "Device_Queue_Tracing";
constexpr char kDatasetIteratorTracingName[] = "Dataset_Iterator_Tracing";
constexpr char kConnectorSizeSamplingName[] = "Connector_Size_Sampling";
constexpr char kCpuSamplerName[] = "Cpu_Sampler";
44 
// Values for process memory metrics - common for profiling and cpu_sampler
//   kPSS - proportional set size
//   kRSS - resident set size
//   kVSS - virtual set size
enum ProcessMemoryMetric { kPSS, kRSS, kVSS };
47 
// Values for system memory metrics - common for profiling and cpu_sampler
enum SystemMemoryMetric { kMemoryAvailable, kMemoryTotal, kMemoryUsed };
50 
51 // Profiling is a class of basic unit of profiling action
52 // This base class encapsulate the serialization output logic
53 class Profiling : public std::enable_shared_from_this<Profiling> {
54  public:
55   // Constructor
Profiling()56   Profiling() : active_(false) {}
57 
58   // Destructor
59   virtual ~Profiling() = default;
60 
61   virtual Status Init() = 0;
62 
63   // Default serialization file generator
64   virtual Status SaveToFile(const std::string &dir_path, const std::string &rank_id) = 0;
65 
66   // Profiling name
67   virtual std::string Name() const = 0;
68 
69   virtual Status ChangeFileMode(const std::string &dir_path, const std::string &rank_id) = 0;
70 
71   // Start collecting data
72   Status Start();
73 
74   // Stop collecting data
75   Status Stop();
76 
77   // Clear all collected data
78   virtual void Clear() = 0;
79 
80  protected:
81   bool active_;  // show current state of ProfilingManager (running, or paused)
82   std::mutex lock_;
83   virtual Path GetFileName(const std::string &dir_path, const std::string &rank_id) = 0;
84 };
85 
86 // Sampling is a class of profiling which generate samples periodically.
87 class Sampling : public Profiling {
88  public:
89   // Sampling action function. This function will be invoked by performance monitor thread.
90   virtual Status Sample() = 0;
91 
92   ~Sampling() override = default;
93 };
94 
// A single tracing record.
// The meaning of type / extra_info / value is defined by the code that fills the record (see Tracing::Record).
struct TracingRecord_s {
  int32_t type;
  int32_t extra_info;
  int32_t batch_num;
  int32_t value;
  uint64_t ts;

  // Serialize the record as its five fields, in declaration order, separated by single spaces.
  std::string ToString() const {
    return std::to_string(type) + " " + std::to_string(extra_info) + " " + std::to_string(batch_num) + " " +
           std::to_string(value) + " " + std::to_string(ts);
  }
};
// Both names are kept so that existing users of either spelling keep compiling.
using TracingRecord = TracingRecord_s;
107 
108 // Tracing is class of profiling which record samples upon request.
109 class Tracing : public Profiling {
110  public:
111   // Tracing has minimal interface to provide flexible on data recording.
112   // It only includes some common routines.
113   Status SaveToFile(const std::string &dir_path, const std::string &rank_id) override;
114   Status ChangeFileMode(const std::string &dir_path, const std::string &rank_id) override;
115   Status Init() override;
116   Status GetPipelineTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
117   Status GetPushTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
118   Status GetBatchTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
119   Status GetConnectorSize(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
120   Status GetConnectorCapacity(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
121   Status GetEmptyQueueFrequency(int32_t start_step, int32_t end_step, float_t *empty_queue_freq);
122   void Record(const int32_t type, const int32_t extra_info, const int32_t batch_num, const int32_t value,
123               const uint64_t time_stamp);
124   Status TimeIntervalForStepRange(int32_t start_step, int32_t end_step, uint64_t *start_ts, uint64_t *end_ts);
125   Status StepIntervalForTimeRange(uint64_t start_ts, uint64_t end_ts, int32_t *start_step, int32_t *end_step);
126   size_t GetNumberSteps();
127 
128   // Clear all collected data
129   void Clear() override;
130 
131  protected:
132   Tracing() = default;
133   std::vector<std::string> value_;
134   std::vector<TracingRecord> records_;
135   std::vector<uint64_t> ts_;  // End time of each step or batch
136   Status GetRecordEntryFieldValue(int32_t start_step, int32_t end_step, int32_t record_offset, const std::string &field,
137                                   std::vector<int32_t> *result);
138 };
139 
140 // ProfilingManager is a class manages all profiling infrastructure
141 // It serves the following purposes:
142 // 1) Fetch profiling configs from global contexts
143 // 2) Setup all profiling node based on config
144 // 3) Provide access of profiling nodes for profiling actions
145 // 4) Manage profiling data serialization process
146 class ProfilingManager {
147   friend Monitor;
148 
149  public:
150   ProfilingManager();
151 
152   ~ProfilingManager() = default;
153 
154   /// Register the given tree to be profiled.
155   /// This method should be called once, calling it for another tree without resetting the ProfilingManager would fail.
156   /// \param tree_adapter pointer the adapter that owns the ExecutionTree
157   /// \return Status the status code returned
158   Status RegisterTree(const TreeAdapter *tree_adapter);
159 
160   /// Reset the ProfilingManager. This method is sued when we want to profile another tree in the same process.
161   /// \return Status the status code returned
162   Status Reset();
163 
164   // Save profile data to file
165   // @param dir_path_ The path to the directory where the profiling data will be saved.
166   // @return Status The status code returned
167   Status SaveProfilingData(const std::string &dir_path, const std::string &rank_id);
168 
169   // Sampling node getter
170   // @param name - The name of the requested node
171   // @param node - Pointer to the shared pointer for the Sampling node
172   // @return Status The status code returned
173   Status GetSamplingNode(const std::string &name, std::shared_ptr<Sampling> *node);
174 
175   // Tracing node getter
176   // @param name - The name of the requested node
177   // @param node - Pointer to the shared pointer for the Tracing node
178   // @return Status The status code returned
179   Status GetTracingNode(const std::string &name, std::shared_ptr<Tracing> *node);
180 
181   // return true if enabled_ is set to true, namely if Init() has been called successfully
182   // @param tree - Execution tree pointer
183   bool IsProfilingEnable(const ExecutionTree *tree = nullptr) const;
184 
185   // Record end of epoch information
186   // @param step_num - The number of steps
187   void RecordEndOfEpoch(uint32_t step_num);
188 
GetSamplingNodes()189   const std::unordered_map<std::string, std::shared_ptr<Sampling>> &GetSamplingNodes() const { return sampling_nodes_; }
190 
191   // Launch monitoring thread.
192   Status LaunchMonitor();
193 
194   // @return Status The status code returned
195   Status ChangeFileMode(const std::string &dir_path, const std::string &rank_id);
196 
197 #ifndef ENABLE_ANDROID
198   /// \brief API to get User CPU utilization for the system
199   /// \param [in] epoch_num The epoch number for which results are requested
200   /// \param [out] result A vector with the sampled User CPU Utilization for the entire system
201   /// \return Status object with the error code
202   Status GetUserCpuUtilByEpoch(int32_t epoch_num, std::vector<uint8_t> *result);
203 
204   /// \brief API to get User CPU utilization for the system
205   /// \param [in] start_step The step interval start range
206   /// \param [in] end_step The step interval end range
207   /// \param [out] result A vector with the sampled User CPU Utilization for the entire system
208   /// \return Status object with the error code
209   Status GetUserCpuUtilByStep(int32_t start_step, int32_t end_step, std::vector<uint8_t> *result);
210 
211   /// \brief API to get User CPU utilization for the system
212   /// \param [in] start_ts The time interval start range in ms
213   /// \param [in] end_ts The time interval end range in ms
214   /// \param [out] result A vector with the sampled User CPU Utilization for the entire system
215   /// \return Status object with the error code
216   Status GetUserCpuUtilByTime(uint64_t start_ts, uint64_t end_ts, std::vector<uint8_t> *result);
217 
218   /// \brief API to get System CPU utilization for the system
219   /// \param [in] epoch_num The epoch number for which results are requested
220   /// \param [out] result A vector with the sampled System CPU Utilization for the entire system
221   /// \return Status object with the error code
222   Status GetSysCpuUtilByEpoch(int32_t epoch_num, std::vector<uint8_t> *result);
223 
224   /// \brief API to get System CPU utilization for the system
225   /// \param [in] start_step The step interval start range
226   /// \param [in] end_step The step interval end range
227   /// \param [out] result A vector with the sampled System CPU Utilization for the entire system
228   /// \return Status object with the error code
229   Status GetSysCpuUtilByStep(int32_t start_step, int32_t end_step, std::vector<uint8_t> *result);
230 
231   /// \brief API to get System CPU utilization for the system
232   /// \param [in] start_ts The time interval start range in ms
233   /// \param [in] end_ts The time interval end range in ms
234   /// \param [out] result A vector with the sampled System CPU Utilization for the entire system
235   /// \return Status object with the error code
236   Status GetSysCpuUtilByTime(uint64_t start_ts, uint64_t end_ts, std::vector<uint8_t> *result);
237 
238   /// \brief API to get User CPU Utilization of an MD operator
239   /// \param [in] op_id The id of the operator
240   /// \param [in] epoch_num The epoch number for which results are requested
241   /// \param [out] result A vector with the sampled User CPU Utilization of the operator.
242   /// \return Status object with the error code
243   Status GetUserCpuUtilByEpoch(int32_t op_id, int32_t epoch_num, std::vector<uint16_t> *result);
244 
245   /// \brief API to get User CPU Utilization of an MD operator
246   /// \param [in] op_id The id of the operator
247   /// \param [in] start_step The step interval start range
248   /// \param [in] end_step The step interval end range
249   /// \param [out] result A vector with the sampled User CPU Utilization of the operator.
250   /// \return Status object with the error code
251   Status GetUserCpuUtilByStep(int32_t op_id, int32_t start_step, int32_t end_step, std::vector<uint16_t> *result);
252 
253   /// \brief API to get User CPU Utilization of an MD operator
254   /// \param [in] op_id The id of the operator
255   /// \param [in] start_ts The time interval start range in ms
256   /// \param [in] end_ts The time interval end range in ms
257   /// \param [out] result A vector with the sampled User CPU Utilization of the operator.
258   /// \return Status object with the error code
259   Status GetUserCpuUtilByTime(int32_t op_id, uint64_t start_ts, uint64_t end_ts, std::vector<uint16_t> *result);
260 
261   /// \brief API to get System CPU Utilization of an MD operator
262   /// \param [in] op_id The id of the operator
263   /// \param [in] epoch_num The epoch number for which results are requested
264   /// \param [out] result A vector with the sampled System CPU Utilization of the operator.
265   /// \return Status object with the error code
266   Status GetSysCpuUtilByEpoch(int32_t op_id, int32_t epoch_num, std::vector<uint16_t> *result);
267 
268   /// \brief API to get System CPU Utilization of an MD operator
269   /// \param [in] op_id The id of the operator
270   /// \param [in] start_step The step interval start range
271   /// \param [in] end_step The step interval end range
272   /// \param [out] result A vector with the sampled System CPU Utilization of the operator.
273   /// \return Status object with the error code
274   Status GetSysCpuUtilByStep(int32_t op_id, int32_t start_step, int32_t end_step, std::vector<uint16_t> *result);
275 
276   /// \brief API to get System CPU Utilization of an MD operator
277   /// \param [in] op_id The id of the operator
278   /// \param [in] start_ts The time interval start range in ms
279   /// \param [in] end_ts The time interval end range in ms
280   /// \param [out] result A vector with the sampled System CPU Utilization of the operator.
281   /// \return Status object with the error code
282   Status GetSysCpuUtilByTime(int32_t op_id, uint64_t start_ts, uint64_t end_ts, std::vector<uint16_t> *result);
283 
284   /// \brief API to get information on main process memory usage
285   /// \param [in] metric The requested memory set usage.  One of these values:
286   ///     - ProcessMemoryMetric::kVSS - virtual set size, virtual memory usage
287   ///     - ProcessMemoryMetric::kPSS - proportional set size, physical memory usage with proportional allocation of
288   ///     shared libraries
289   ///     - ProcessMemoryMetric::kRSS - resident set size, physical memory usage (includes shared libraries)
290   /// \param [in] epoch_num The epoch number for which results are requested
291   /// \param [out] result The desired value in MB
292   /// \return Status object with the error code
293   Status GetMainProcessMemoryInfoByEpoch(ProcessMemoryMetric metric, int32_t epoch_num, std::vector<float> *result);
294 
295   /// \brief API to get information on main process memory usage
296   /// \param [in] metric The requested memory set usage.  One of these values:
297   ///     - ProcessMemoryMetric::kVSS - virtual set size, virtual memory usage
298   ///     - ProcessMemoryMetric::kPSS - proportional set size, physical memory usage with proportional allocation of
299   ///     shared libraries
300   ///     - ProcessMemoryMetric::kRSS - resident set size, physical memory usage (includes shared libraries)
301   /// \param [in] end_ts The time interval end range in ms
302   /// \param [in] start_step The step interval start range
303   /// \param [in] end_step The step interval end range
304   /// \param [out] result The desired value in MB
305   /// \return Status object with the error code
306   Status GetMainProcessMemoryInfoByStep(ProcessMemoryMetric metric, int32_t start_step, int32_t end_step,
307                                         std::vector<float> *result);
308 
309   /// \brief API to get information on main process memory usage
310   /// \param [in] metric The requested memory set usage.  One of these values:
311   ///     - ProcessMemoryMetric::kVSS - virtual set size, virtual memory usage
312   ///     - ProcessMemoryMetric::kPSS - proportional set size, physical memory usage with proportional allocation of
313   ///     shared libraries
314   ///     - ProcessMemoryMetric::kRSS - resident set size, physical memory usage (includes shared libraries)
315   /// \param [in] start_ts The time interval start range in ms
316   /// \param [in] end_ts The time interval end range in ms
317   /// \param [out] result The desired value in MB
318   /// \return Status object with the error code
319   Status GetMainProcessMemoryInfoByTime(ProcessMemoryMetric metric, uint64_t start_ts, uint64_t end_ts,
320                                         std::vector<float> *result);
321 
322   /// \brief API to get information on system memory usage
323   /// \param [in] metric The requested memory metric.  One of these values:
324   ///     - SystemMemoryMetric::kMemoryAvailable
325   ///     - SystemMemoryMetric::kMemoryTotal
326   ///     - SystemMemoryMetric::kMemoryUsed
327   /// \param [in] epoch_num The epoch number for which results are requested
328   /// \param [out] result The desired value in MB
329   /// \return Status object with the error code
330   Status GetSystemMemoryInfoByEpoch(SystemMemoryMetric metric, int32_t epoch_num, std::vector<float> *result);
331 
332   /// \brief API to get information on system memory usage
333   /// \param [in] metric The requested memory metric.  One of these values:
334   ///     - SystemMemoryMetric::kMemoryAvailable
335   ///     - SystemMemoryMetric::kMemoryTotal
336   ///     - SystemMemoryMetric::kMemoryUsed
337   /// \param [in] start_step The step interval start range
338   /// \param [in] end_step The step interval end range
339   /// \param [out] result The desired value in MB
340   /// \return Status object with the error code
341   Status GetSystemMemoryInfoByStep(SystemMemoryMetric metric, int32_t start_step, int32_t end_step,
342                                    std::vector<float> *result);
343 
344   /// \brief API to get information on system memory usage
345   /// \param [in] metric The requested memory metric.  One of these values:
346   ///     - SystemMemoryMetric::kMemoryAvailable
347   ///     - SystemMemoryMetric::kMemoryTotal
348   ///     - SystemMemoryMetric::kMemoryUsed
349   /// \param [in] start_ts The time interval start range in ms
350   /// \param [in] end_ts The time interval end range in ms
351   /// \param [out] result The desired value in MB
352   /// \return Status object with the error code
353   Status GetSystemMemoryInfoByTime(SystemMemoryMetric metric, uint64_t start_ts, uint64_t end_ts,
354                                    std::vector<float> *result);
355 #endif
356 
357   /// \brief API to get the connector size of an MD operator
358   /// \param [in] op_id The id of the operator
359   /// \param [in] epoch_num The epoch number for which results are requested
360   /// \param [out] result A vector with the sampled connector sizes of the operator
361   /// \return Status object with the error code
362   Status GetConnectorSizeByEpoch(int32_t op_id, int32_t epoch_num, std::vector<int32_t> *result);
363 
364   /// \brief API to get the connector size of an MD operator
365   /// \param [in] op_id The id of the operator
366   /// \param [in] start_step The step interval start range
367   /// \param [in] end_step The step interval end range
368   /// \param [out] result A vector with the sampled connector sizes of the operator
369   /// \return Status object with the error code
370   Status GetConnectorSizeByStep(int32_t op_id, int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
371 
372   /// \brief API to get the connector size of an MD operator
373   /// \param [in] op_id The id of the operator
374   /// \param [in] start_ts The time interval start range in ms
375   /// \param [in] end_ts The time interval end range in ms
376   /// \param [out] result A vector with the sampled connector sizes of the operator
377   /// \return Status object with the error code
378   Status GetConnectorSizeByTime(int32_t op_id, uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result);
379 
380   /// \brief API to get the connector size of DatasetIterator or DataQueueOp
381   /// \param [in] epoch_num The epoch number for which results are requested
382   /// \param [out] result A vector with connector size at each step
383   /// \return Status object with the error code
384   Status GetConnectorSizeByEpoch(int32_t epoch_num, std::vector<int32_t> *result);
385 
386   /// \brief API to get the connector size of DatasetIterator or DataQueueOp
387   /// \param [in] start_step The step interval start range
388   /// \param [in] end_step The step interval end range
389   /// \param [out] result A vector with connector size at each step
390   /// \return Status object with the error code
391   Status GetConnectorSizeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
392 
393   /// \brief API to get the connector size of DatasetIterator or DataQueueOp
394   /// \param [in] start_ts The time interval start range in ms
395   /// \param [in] end_ts The time interval end range in ms
396   /// \param [out] result A vector with connector size at each step
397   /// \return Status object with the error code
398   Status GetConnectorSizeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result);
399 
400   /// \brief API to get the connector capacity of DatasetIterator or DataQueueOp
401   /// \param [in] epoch_num The epoch number for which results are requested
402   /// \param [out] result A vector with connector capacity at each step
403   /// \return Status object with the error code
404   Status GetConnectorCapacityByEpoch(int32_t epoch_num, std::vector<int32_t> *result);
405 
406   /// \brief API to get the connector capacity of DatasetIterator or DataQueueOp
407   /// \param [in] start_step The step interval start range
408   /// \param [in] end_step The step interval end range
409   /// \param [out] result A vector with connector capacity at each step
410   /// \return Status object with the error code
411   Status GetConnectorCapacityByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
412 
413   /// \brief API to get the connector capacity of DatasetIterator or DataQueueOp
414   /// \param [in] start_ts The time interval start range in ms
415   /// \param [in] end_ts The time interval end range in ms
416   /// \param [out] result A vector with connector capacity for steps in the given time range
417   /// \return Status object with the error code
418   Status GetConnectorCapacityByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result);
419 
420   /// \brief API to get the pipeline time of batches
421   /// \param [in] epoch_num The epoch number for which results are requested
422   /// \param [out] result A vector with the pipeline time for each step
423   /// \return Status object with the error code
424   Status GetPipelineTimeByEpoch(int32_t epoch_num, std::vector<int32_t> *result);
425 
426   /// \brief API to get the pipeline time of batches
427   /// \param [in] start_step The step interval start range
428   /// \param [in] end_step The step interval end range
429   /// \param [out] result A vector with the pipeline time for each step
430   /// \return Status object with the error code
431   Status GetPipelineTimeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
432 
433   /// \brief API to get the pipeline time of batches
434   /// \param [in] start_ts The time interval start range in ms
435   /// \param [in] end_ts The time interval end range in ms
436   /// \param [out] result A vector with the pipeline time for steps in the given time range
437   /// \return Status object with the error code
438   Status GetPipelineTimeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result);
439 
440   /// \brief API to get the push time of batches
441   /// \param [in] epoch_num The epoch number for which results are requested
442   /// \param [out] result A vector with the push time for each each step
443   /// \return Status object with the error code
444   Status GetPushTimeByEpoch(int32_t epoch_num, std::vector<int32_t> *result);
445 
446   /// \brief API to get the push time of batches
447   /// \param [in] start_step The step interval start range
448   /// \param [in] end_step The step interval end range
449   /// \param [out] result A vector with the push time for each each step
450   /// \return Status object with the error code
451   Status GetPushTimeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
452 
453   /// \brief API to get the push time of batches
454   /// \param [in] start_ts The time interval start range in ms
455   /// \param [in] end_ts The time interval end range in ms
456   /// \param [out] result A vector with the push time for steps in the given time range
457   /// \return Status object with the error code
458   Status GetPushTimeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result);
459 
460   /// \brief API to get the batch time of batches
461   /// \param [in] epoch_num The epoch number for which results are requested
462   /// \param [out] result A vector with the batch time for each step
463   /// \return Status object with the error code
464   Status GetBatchTimeByEpoch(int32_t epoch_num, std::vector<int32_t> *result);
465 
466   /// \brief API to get the batch time of batches
467   /// \param [in] start_step The step interval start range
468   /// \param [in] end_step The step interval end range
469   /// \param [out] result A vector with the batch time for each step
470   /// \return Status object with the error code
471   Status GetBatchTimeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
472 
473   /// \brief API to get the batch time of batches
474   /// \param [in] start_ts The time interval start range in ms
475   /// \param [in] end_ts The time interval end range in ms
476   /// \param [out] result A vector with the batch time for steps in the given time range
477   /// \return Status object with the error code
478   Status GetBatchTimeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result);
479 
480   /// \brief API to get fraction of steps that DatasetIterator or DataQueueOp connector was empty
481   /// \param [in] epoch_num The epoch number for which results are requested
482   /// \param [out] result The empty queue frequency
483   /// \return Status object with the error code
484   Status GetEmptyQueueFrequencyByEpoch(int32_t epoch_num, float_t *result);
485 
486   /// \brief API to get fraction of steps that DatasetIterator or DataQueueOp connector was empty
487   /// \param [in] start_step The step interval start range
488   /// \param [in] end_step The step interval end range
489   /// \param [out] result The empty queue frequency
490   /// \return Status object with the error code
491   Status GetEmptyQueueFrequencyByStep(int32_t start_step, int32_t end_step, float_t *result);
492 
493   /// \brief API to get fraction of steps that DatasetIterator or DataQueueOp connector was empty
494   /// \param [in] start_ts The time interval start range in ms
495   /// \param [in] end_ts The time interval end range in ms
496   /// \param [out] result The empty queue frequency
497   /// \return Status object with the error code
498   Status GetEmptyQueueFrequencyByTime(uint64_t start_ts, uint64_t end_ts, float_t *result);
499 
500   // Register profile node to tree
501   // @param node - Profiling node
502   // @return Status The status code returned
503   Status RegisterTracingNode(const std::shared_ptr<Tracing> &node);
504 
505   /// \brief API to initialize profiling manager
506   /// \param for_autotune flag to indicate if Profiler is initialized for autotuning or profiling purposes
507   /// \return Status object with the error code
508   Status Init(const bool for_autotune = false);
509 
510   /// \brief API to signal the profiling nodes to start collecting data
511   /// \return Status object with the error code
512   Status Start();
513 
514   /// \brief API to signal profiling nodes to stop collecting data
515   /// \return Status object with the error code
516   Status Stop();
517 
518   /// \brief API to save to file all the collected data between Start and Stop calls
519   /// \return Status object with the error code
520   Status Save(const std::string &profile_data_path);
521 
522   /// \brief Helper to get the rank id. Currently being used for appending rank id to files
523   /// \return String The rank id
524   std::string GetRankID() const;
525 
526   /// Get number of epochs that have been already profiled
527   /// \return number of epochs
GetNumOfProfiledEpochs()528   int32_t GetNumOfProfiledEpochs() const { return static_cast<int32_t>(epoch_end_step_.size()) - 1; }
529 
530   // Get number of steps taken in pipeline
531   /// \return number of steps
532   Status GetNumberOfProfiledSteps(int32_t *steps);
533 
534   /// Determine if the Profiler is being used for autotuning.
535   /// \return boolean
IsAutotuning()536   bool IsAutotuning() const { return autotuning_; }
537 
538   /// Determine if the Profiler is being used for profiling.
539   /// \return boolean
IsProfiling()540   bool IsProfiling() const { return profiling_; }
541 
542   // Registration state for the profiler
543   enum ProfilingRegistrationState {
544     kNotEnabled,
545     kEnabledTreeNotRegistered,
546     kEnabledTreeRegistered,
547     kEnabledDifferentTreeRegistered,
548   };
549 
550   /// \brief Getter for the profiling and tree registration state
551   /// \param tree Execution Tree pointer
552   /// \return ProfilingRegistrationState
553   ProfilingRegistrationState GetProfilerTreeState(const ExecutionTree *tree) const;
554 
555  protected:
556   std::unique_ptr<Monitor> perf_monitor_;
557 
558   // State flags for profiling
559   enum ProfilingState {
560     kProfilingStateUnBegun,
561     kProfilingStateRunning,
562     kProfilingStateFinished,
563   };
564   ProfilingState profiling_state_;  // show current state of ProfilingManager (running, or paused)
565   std::unordered_map<std::string, std::shared_ptr<Tracing>> tracing_nodes_;
566   std::unordered_map<std::string, std::shared_ptr<Sampling>> sampling_nodes_;
567   ExecutionTree *tree_;                   // ExecutionTree pointer
568   std::vector<uint64_t> epoch_end_ts_;    // End of epoch timestamp
569   std::vector<uint32_t> epoch_end_step_;  // End of epoch step number
570   std::atomic<bool> autotuning_;  // flag to indicate if ProfilingManager is being used for auto-tuning the pipeline
571   std::atomic<bool> profiling_;   // flag to indicate if ProfilingManager is being used for profiling the pipeline
572 
573   // Register profile node to tree
574   // @param node - Profiling node
575   // @return Status The status code returned
576   Status RegisterSamplingNode(const std::shared_ptr<Sampling> &node);
577 
578   /// \brief Helper to convert a given epoch number to a step interval
579   /// \param [in] epoch_num The epoch number to be converted
580   /// \param [out] start_step The corresponding start step for the given epoch
581   /// \param [out] end_step The corresponding end step for the given epoch
582   /// \return Status object with the error code
583   Status EpochToStepInterval(int32_t epoch_num, uint32_t *start_step, uint32_t *end_step);
584 
585   /// \brief Helper to convert a given epoch number to a time interval
586   /// \param [in] epoch_num The epoch number to be converted
587   /// \param [out] start_ts The corresponding starting timestamp in ms for the given epoch
588   /// \param [out] end_ts The corresponding ending timestamp in ms for the given epoch
589   /// \return Status object with the error code
590   Status EpochToTimeInterval(int32_t epoch_num, uint64_t *start_ts, uint64_t *end_ts);
591 
592   /// \brief Helper to convert step interval to a time interval
593   /// \param [in] start_step The step interval start range
594   /// \param [in] end_step The step interval end range
595   /// \param [out] start_ts The corresponding starting timestamp in ms for the given step interval
596   /// \param [out] end_ts The corresponding ending timestamp in ms for the given step interval
597   /// \return Status object with the error code
598   Status StepToTimeInterval(int32_t start_step, int32_t end_step, uint64_t *start_ts, uint64_t *end_ts);
599 
600   /// \brief Helper to convert time interval to a step interval
601   /// \param [in] start_ts The time interval start range in ms
602   /// \param [in] end_ts The time interval end range in ms
603   /// \param [out] start_step The corresponding start step for the given time interval
604   /// \param [out] end_step The corresponding end step for the given time interval
605   /// \return Status object with the error code
606   Status TimeToStepInterval(uint64_t start_ts, uint64_t end_ts, int32_t *start_step, int32_t *end_step);
607 };
608 
// Kind of a profiling record: a time measurement or a connector depth measurement.
enum ProfilingType { TIME, CONNECTOR_DEPTH };
610 
// Sub-kind of a time measurement; INVALID_TIME is the last enumerator.
enum ProfilingTimeSubType {
  PIPELINE_TIME,
  TDT_PUSH_TIME,
  BATCH_TIME,
  INVALID_TIME,
};
617 
// Time helper for the profiling code.
class ProfilingTime {
 public:
  // NOTE(review): presumably returns the current time in milliseconds (going by the name); the clock and epoch
  // used are decided by the definition in the .cc - confirm there.
  static uint64_t GetCurMilliSecond();
};
622 }  // namespace dataset
623 }  // namespace mindspore
624 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PERF_PROFILING_H_
625