• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_FRAMEWORK_METRICS_H_
17 #define TENSORFLOW_CORE_FRAMEWORK_METRICS_H_
18 
19 #include "absl/container/flat_hash_map.h"
20 #include "tensorflow/core/framework/dataset_options.pb.h"
21 #include "tensorflow/core/lib/monitoring/counter.h"
22 #include "tensorflow/core/lib/monitoring/gauge.h"
23 #include "tensorflow/core/platform/statusor.h"
24 #include "tensorflow/core/platform/types.h"
25 
26 namespace tensorflow {
27 namespace metrics {
28 
29 // Records that a tf.data.Dataset executed by the program used autotuning.
30 //
31 // The `name` argument identifies the Dataset type (e.g. "ParallelMap").
32 void RecordTFDataAutotune(const string& name);
33 
34 // Returns a counter that can be used to record the number of bytes consumed by
35 // a tf.data.Dataset.
36 //
37 // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
38 monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name);
39 
40 // Returns a counter that can be used to record the number of bytes produced by
41 // a tf.data.Dataset.
42 //
43 // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
44 monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name);
45 
46 // Returns a counter that can be used to record the number of bytes read from
47 // the filesystem by a tf.data.Dataset source.
48 //
49 // The `name` argument identifies the Dataset type (e.g. "TFRecordDataset").
50 //
51 // TODO(jsimsa): Remove this now that we have GetTFDataBytesConsumedCounter?
52 monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name);
53 
54 // Returns a counter that can be used to record the number of elements produced
55 // by a tf.data.Dataset.
56 //
57 // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
58 monitoring::CounterCell* GetTFDataElementsCounter(const string& name);
59 
60 // Returns a gauge that can be used to record the performance model information.
61 //
62 // The `id` argument represents the (unique) model ID.
63 monitoring::GaugeCell<std::function<std::string()>>* GetTFDataModelGauge(
64     const string& id);
65 
66 // Records the number of bytes fetched from tf.data.Dataset iterator.
67 void RecordTFDataBytesFetched(int64_t num_bytes);
68 
69 // Records the number of times tf.data experiment is applied to input pipelines.
70 void RecordTFDataExperiment(const string& name);
71 
72 // Records the time (in microseconds) spent in a single invocation of
73 // `IteratorResource::GetNext()`.
74 void RecordTFDataGetNextDuration(uint64 duration_us);
75 
76 // Records the number of times each tf.data fingerprint is used
77 // to measure duplicate pre-processing.
78 //
79 // The `name` argument identifies the Dataset graph fingerprint,
80 // created using GraphHash().
81 void RecordTFDataFingerprint(const string& name);
82 
83 // Records the time (in microseconds) during which `IteratorResource` was busy
84 // processing at least one `GetNext()` request.
85 void RecordTFDataIteratorBusy(uint64 duration_us);
86 
87 // Records the time (in microseconds) between `IteratorResource` receiving the
88 // first `GetNext()` request and responding to the last `GetNext()` request.
89 void RecordTFDataIteratorLifetime(uint64 duration_us);
90 
91 // Records the number of independent graph changes resulting from the
92 // application of a tf.data optimization.
93 //
94 // The `name` argument identifies the optimization (e.g. "noop_elimination").
95 void RecordTFDataOptimization(const string& name, int64_t num_changes);
96 
97 // Records that a tf.data service worker has been created.
98 void RecordTFDataServiceWorkerCreated();
99 
100 // Records the file name read by a tf.data Dataset.
101 //
102 // The `name` argument identifies the Dataset type (e.g. "TFRecordDataset").
103 void RecordTFDataFilename(const string& name, const string& filename);
104 
105 // Records statistics of tf.data auto sharding.
106 //
107 // The `id` is a unique identifier of the input pipeline. The `policy`
108 // identifies the auto-sharding policy used, the `num_workers` identifies the
109 // number of workers, and `num_replicas` identifies the number of replicas.
110 void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy,
111                            int64 num_workers, int64 num_replicas);
112 
113 // Records parsing of dense tensor features.
114 void RecordParseDenseFeature(int64_t num_features);
115 
116 // Records parsing of sparse tensor features.
117 void RecordParseSparseFeature(int64_t num_features);
118 
119 // Records parsing of ragged tensor features.
120 void RecordParseRaggedFeature(int64_t num_features);
121 
122 // Records the size of input/output tensors in bytes.
123 void RecordGraphInputTensors(const size_t size);
124 void RecordGraphOutputTensors(const size_t size);
125 
126 // Records the number of cores requested by graphs with XLA SPMD enabled.
127 void RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica);
128 
129 void UpdateGraphExecTime(const uint64 running_time_usecs);
130 void UpdateGraphPendingQueueLength(uint64 len);
131 
132 // Records that one output of an op of type `op_name` was unused.
133 void RecordUnusedOutput(const string& op_name);
134 
135 // Updates the metrics stored about time spent building graphs.
136 //
137 // By "GraphBuild", we refer to building a client graph, which is a sub-graph of
138 // the full graph, induced by a set of options. In particular, these options
139 // include the feeds and fetches requested.
140 //
141 // This includes time spent:
142 //   * optimizing the graphs with Grappler
143 //   * pruning the sub-graph (unless the place_pruned_graph option is set)
144 //
145 // When executing eagerly, this will not record any activity.
146 //
147 // TODO(jtkeeling): Should we record building/optimizing tf.functions?
148 void UpdateGraphBuildTime(const uint64 running_time_usecs);
149 
150 // Updates the metrics stored about graph optimizations.
151 void UpdateGraphOptimizationPassTime(const string& pass_name,
152                                      const uint64 running_time_usecs);
153 void UpdateGrapplerPassTime(const string& pass_name,
154                             const uint64 running_time_usecs);
155 
156 // Updates metrics for time to distribute variables to all TPU hosts.
157 void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs);
158 
159 // Updates the metrics stored about the time XLA spends compiling graphs.
160 void UpdateXlaCompilationTime(const uint64 compilation_time_usecs);
161 
162 // Updates the metrics stored about the time the BFC allocator spends during delay.
163 void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs);
164 
165 }  // namespace metrics
166 }  // namespace tensorflow
167 
168 #endif  // TENSORFLOW_CORE_FRAMEWORK_METRICS_H_
169