1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/framework/metrics.h"
17 #include "tensorflow/core/lib/monitoring/counter.h"
18 #include "tensorflow/core/lib/monitoring/sampler.h"
19
20 namespace tensorflow {
21 namespace metrics {
22 namespace {
23
24 auto* graph_runs = monitoring::Counter<0>::New(
25 "/tensorflow/core/graph_runs",
26 "The number of graph executions used to collect "
27 "/tensorflow/core/graph_run_time_usecs");
28
29 auto* graph_run_time_usecs = monitoring::Counter<0>::New(
30 "/tensorflow/core/graph_run_time_usecs",
31 "The total time spent on executing graphs in microseconds.");
32
33 auto* graph_optimization_usecs =
34 monitoring::Counter<2>::New("/tensorflow/core/graph_optimization_usecs",
35 "The total time spent running each graph "
36 "optimization pass in microseconds.",
37 "kind", "name");
38
39 auto* graph_run_time_usecs_histogram = monitoring::Sampler<0>::New(
40 {"/tensorflow/core/graph_run_time_usecs_histogram",
41 "The wall-clock time spent on executing graphs in microseconds."},
42 // Power of 2 with bucket count 20 (> 17 minutes)
43 {monitoring::Buckets::Exponential(1000, 2, 20)});
44
45 auto* graph_pending_queue_length_histogram = monitoring::Sampler<0>::New(
46 {"/tensorflow/core/graph_pending_queue_length_histogram",
47 "The number of pending (ready but not running) tasks in graph executor."},
48 // Power of 1.5 with bucket count 30 (> 191k)
49 {monitoring::Buckets::Exponential(1, 1.5, 30)});
50
51 auto* graph_run_input_tensor_bytes = monitoring::Sampler<0>::New(
52 {"/tensorflow/core/graph_run_input_tensor_bytes",
53 "The size of input tensors in bytes."},
54 // Power of 2 with bucket count 14 (256MB)
55 {monitoring::Buckets::Exponential(1, 4, 14)});
56
57 auto* graph_run_output_tensor_bytes = monitoring::Sampler<0>::New(
58 {"/tensorflow/core/graph_run_output_tensor_bytes",
59 "The size of output tensors in bytes."},
60 // Power of 2 with bucket count 14 (256MB)
61 {monitoring::Buckets::Exponential(1, 4, 14)});
62
63 auto* graph_unused_outputs = monitoring::Counter<1>::New(
64 "/tensorflow/core/graph_unused_outputs",
65 "The number of unused outputs for ops of a given type.", "name");
66
67 auto* tf_data_autotune_counter = monitoring::Counter<1>::New(
68 "/tensorflow/data/autotune", "tf.data autotuning", "name");
69
70 auto* tf_data_bytes_consumed_counter = monitoring::Counter<1>::New(
71 "/tensorflow/data/bytes_consumed",
72 "The number of bytes consumed by a tf.data Dataset.", "name");
73
74 auto* tf_data_bytes_produced_counter = monitoring::Counter<1>::New(
75 "/tensorflow/data/bytes_produced",
76 "The number of bytes produced by a tf.data Dataset.", "name");
77
78 auto* tf_data_bytes_read_counter = monitoring::Counter<1>::New(
79 "/tensorflow/data/bytes_read",
80 "The number of bytes read by tf.data Dataset sources.", "name");
81
82 auto* tf_data_bytes_fetched_counter = monitoring::Counter<0>::New(
83 "/tensorflow/data/bytes_fetched",
84 "The number of bytes fetched from tf.data Dataset iterator.");
85
86 auto* tf_data_elements_counter = monitoring::Counter<1>::New(
87 "/tensorflow/data/elements", "tf.data elements", "name");
88
89 auto* tf_data_experiment_counter = monitoring::Counter<1>::New(
90 "/tensorflow/data/experiment",
91 "The number of times tf.data experiment is applied to input pipelines.",
92 "name");
93
94 auto* tf_data_fingerprint_counter = monitoring::Counter<1>::New(
95 "/tensorflow/data/fingerprint", "tf.data fingerprint", "name");
96
97 auto* tf_data_get_next_duration_usecs_histogram = monitoring::Sampler<0>::New(
98 {"/tensorflow/data/getnext_duration",
99 "Microseconds spent fetching an element from tf.data iterator."},
100 // Power of 2 with bucket count 10 (1024 microseconds) and 1 second.
101 {monitoring::Buckets::Explicit(
102 {2., 4., 8., 16., 32., 64., 128., 256., 512., 1024., 1e6})});
103
104 auto* tf_data_iterator_busy_counter =
105 monitoring::Counter<0>::New("/tensorflow/data/iterator_busy",
106 "The time (in microseconds) during which a "
107 "tf.data iterator was busy processing at "
108 "least one `GetNext()` request.");
109
110 auto* tf_data_iterator_lifetime_counter = monitoring::Counter<0>::New(
111 "/tensorflow/data/iterator_lifetime",
112 "The time (in microseconds) between a tf.data iterator receiving the first "
113 "`GetNext()` request and responding to the last `GetNext()` request.");
114
115 auto* tf_data_optimization_counter = monitoring::Counter<1>::New(
116 "/tensorflow/data/optimization", "tf.data optimization", "name");
117
118 auto* tf_data_filename_counter = monitoring::Counter<2>::New(
119 "/tensorflow/data/filename", "The file name read by a tf.data Dataset.",
120 "name", "filename");
121
122 auto* parse_dense_feature_counter = monitoring::Counter<0>::New(
123 "/tensorflow/data/dense_feature",
124 "The number of dense features parsed by ops for parsing tf.Example.");
125
126 auto* parse_sparse_feature_counter = monitoring::Counter<0>::New(
127 "/tensorflow/data/sparse_feature",
128 "The number of sparse features parsed by ops for parsing tf.Example.");
129
130 auto* parse_ragged_feature_counter = monitoring::Counter<0>::New(
131 "/tensorflow/data/ragged_feature",
132 "The number of ragged features parsed by ops for parsing tf.Example.");
133
134 auto* build_graph_calls = monitoring::Counter<0>::New(
135 "/tensorflow/core/graph_build_calls",
136 "The number of times TensorFlow has created a new client graph. "
137 "A client graph is a sub-graph of the full graph, induced by a set of "
138 "options, including the requested feeds and fetches. It includes time "
139 "spent optimizing the graph with Grappler, and time spent pruning the "
140 "sub-graph.");
141
142 auto* build_graph_time_usecs = monitoring::Counter<0>::New(
143 "/tensorflow/core/graph_build_time_usecs",
144 "The amount of time TensorFlow has spent creating new client graphs in "
145 "microseconds. "
146 "A client graph is a sub-graph of the full graph, induced by a set of "
147 "options, including the requested feeds and fetches. It includes time "
148 "spent optimizing the graph with Grappler, and time spent pruning the "
149 "sub-graph.");
150
151 auto* xla_compilations = monitoring::Counter<0>::New(
152 "/tensorflow/core/xla_compilations",
153 "The number of XLA compilations used to collect "
154 "/tensorflow/core/xla_compilation_time_usecs");
155
156 auto* xla_compilation_time_usecs = monitoring::Counter<0>::New(
157 "/tensorflow/core/xla_compilation_time_usecs",
158 "The total time spent on compiling XLA graphs in microseconds.");
159
160 auto* mlir_import_failure_count = monitoring::Counter<0>::New(
161 "/tensorflow/mlir/import_failure_count",
162 "The number of jobs that failed during mlir import or verification.");
163
164 auto* bfc_allocator_delay =
165 monitoring::Counter<0>::New("/tensorflow/core/bfc_allocator_delay",
166 "The total time spent running each graph "
167 "optimization pass in microseconds.");
168
169 } // namespace
170
RecordTFDataAutotune(const string & name)171 void RecordTFDataAutotune(const string& name) {
172 tf_data_autotune_counter->GetCell(name)->IncrementBy(1);
173 }
174
GetTFDataBytesConsumedCounter(const string & name)175 monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name) {
176 return tf_data_bytes_consumed_counter->GetCell(name);
177 }
178
GetTFDataBytesProducedCounter(const string & name)179 monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name) {
180 return tf_data_bytes_produced_counter->GetCell(name);
181 }
182
GetTFDataBytesReadCounter(const string & name)183 monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name) {
184 return tf_data_bytes_read_counter->GetCell(name);
185 }
186
GetTFDataElementsCounter(const string & name)187 monitoring::CounterCell* GetTFDataElementsCounter(const string& name) {
188 return tf_data_elements_counter->GetCell(name);
189 }
190
RecordTFDataBytesFetched(int64 num_bytes)191 void RecordTFDataBytesFetched(int64 num_bytes) {
192 tf_data_bytes_fetched_counter->GetCell()->IncrementBy(num_bytes);
193 }
194
RecordTFDataExperiment(const string & name)195 void RecordTFDataExperiment(const string& name) {
196 tf_data_experiment_counter->GetCell(name)->IncrementBy(1);
197 }
198
RecordTFDataFingerprint(const string & name)199 void RecordTFDataFingerprint(const string& name) {
200 tf_data_fingerprint_counter->GetCell(name)->IncrementBy(1);
201 }
202
RecordTFDataGetNextDuration(uint64 duration_us)203 void RecordTFDataGetNextDuration(uint64 duration_us) {
204 static auto* tf_data_get_next_duration_cell =
205 tf_data_get_next_duration_usecs_histogram->GetCell();
206 tf_data_get_next_duration_cell->Add(duration_us);
207 }
208
RecordTFDataIteratorBusy(uint64 duration_us)209 void RecordTFDataIteratorBusy(uint64 duration_us) {
210 static auto* tf_data_iterator_busy_cell =
211 tf_data_iterator_busy_counter->GetCell();
212 tf_data_iterator_busy_cell->IncrementBy(duration_us);
213 }
214
RecordTFDataIteratorLifetime(uint64 duration_us)215 void RecordTFDataIteratorLifetime(uint64 duration_us) {
216 static auto* tf_data_iterator_lifetime_cell =
217 tf_data_iterator_lifetime_counter->GetCell();
218 tf_data_iterator_lifetime_cell->IncrementBy(duration_us);
219 }
220
RecordTFDataOptimization(const string & name,int64 num_changes)221 void RecordTFDataOptimization(const string& name, int64 num_changes) {
222 tf_data_optimization_counter->GetCell(name)->IncrementBy(num_changes);
223 }
224
RecordTFDataFilename(const string & name,const string & filename)225 void RecordTFDataFilename(const string& name, const string& filename) {
226 tf_data_filename_counter->GetCell(name, filename)->IncrementBy(1);
227 }
228
RecordParseDenseFeature(int64 num_features)229 void RecordParseDenseFeature(int64 num_features) {
230 static auto* parse_dense_feature_counter_cell =
231 parse_dense_feature_counter->GetCell();
232 parse_dense_feature_counter_cell->IncrementBy(num_features);
233 }
234
RecordParseSparseFeature(int64 num_features)235 void RecordParseSparseFeature(int64 num_features) {
236 static auto* parse_sparse_feature_counter_cell =
237 parse_sparse_feature_counter->GetCell();
238 parse_sparse_feature_counter_cell->IncrementBy(num_features);
239 }
240
RecordParseRaggedFeature(int64 num_features)241 void RecordParseRaggedFeature(int64 num_features) {
242 static auto* parse_ragged_feature_counter_cell =
243 parse_ragged_feature_counter->GetCell();
244 parse_ragged_feature_counter_cell->IncrementBy(num_features);
245 }
246
RecordGraphInputTensors(const size_t size)247 void RecordGraphInputTensors(const size_t size) {
248 static auto* graph_run_input_tensor_bytes_cell =
249 graph_run_input_tensor_bytes->GetCell();
250 graph_run_input_tensor_bytes_cell->Add(size);
251 }
252
RecordGraphOutputTensors(const size_t size)253 void RecordGraphOutputTensors(const size_t size) {
254 static auto* graph_run_output_tensor_bytes_cell =
255 graph_run_output_tensor_bytes->GetCell();
256 graph_run_output_tensor_bytes_cell->Add(size);
257 }
258
UpdateGraphExecTime(const uint64 running_time_usecs)259 void UpdateGraphExecTime(const uint64 running_time_usecs) {
260 if (running_time_usecs > 0) {
261 static auto* graph_runs_cell = graph_runs->GetCell();
262 static auto* graph_run_time_usecs_cell = graph_run_time_usecs->GetCell();
263 static auto* graph_run_time_usecs_histogram_cell =
264 graph_run_time_usecs_histogram->GetCell();
265 graph_runs_cell->IncrementBy(1);
266 graph_run_time_usecs_cell->IncrementBy(running_time_usecs);
267 graph_run_time_usecs_histogram_cell->Add(running_time_usecs);
268 }
269 }
270
UpdateGraphPendingQueueLength(uint64 len)271 void UpdateGraphPendingQueueLength(uint64 len) {
272 static auto* graph_pending_queue_length_cell =
273 graph_pending_queue_length_histogram->GetCell();
274 graph_pending_queue_length_cell->Add(len);
275 }
276
UpdateGraphOptimizationPassTime(const string & pass_name,const uint64 running_time_usecs)277 void UpdateGraphOptimizationPassTime(const string& pass_name,
278 const uint64 running_time_usecs) {
279 if (running_time_usecs > 0) {
280 graph_optimization_usecs->GetCell("GraphOptimizationPass", pass_name)
281 ->IncrementBy(running_time_usecs);
282 }
283 }
284
UpdateGrapplerPassTime(const string & pass_name,const uint64 running_time_usecs)285 void UpdateGrapplerPassTime(const string& pass_name,
286 const uint64 running_time_usecs) {
287 if (running_time_usecs > 0) {
288 graph_optimization_usecs->GetCell("Grappler", pass_name)
289 ->IncrementBy(running_time_usecs);
290 }
291 }
292
UpdateGraphBuildTime(const uint64 running_time_usecs)293 void UpdateGraphBuildTime(const uint64 running_time_usecs) {
294 if (running_time_usecs > 0) {
295 static auto* build_graph_calls_cell = build_graph_calls->GetCell();
296 static auto* build_graph_time_usecs_cell =
297 build_graph_time_usecs->GetCell();
298 build_graph_calls_cell->IncrementBy(1);
299 build_graph_time_usecs_cell->IncrementBy(running_time_usecs);
300 }
301 }
302
UpdateXlaCompilationTime(const uint64 compilation_time_usecs)303 void UpdateXlaCompilationTime(const uint64 compilation_time_usecs) {
304 if (compilation_time_usecs > 0) {
305 static auto* xla_compilations_cell = xla_compilations->GetCell();
306 static auto* xla_compilation_time_usecs_cell =
307 xla_compilation_time_usecs->GetCell();
308 xla_compilations_cell->IncrementBy(1);
309 xla_compilation_time_usecs_cell->IncrementBy(compilation_time_usecs);
310 }
311 }
312
UpdateBfcAllocatorDelayTime(const uint64 delay_usecs)313 void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs) {
314 static auto* bfc_allocator_delay_cell = bfc_allocator_delay->GetCell();
315 if (delay_usecs > 0) {
316 bfc_allocator_delay_cell->IncrementBy(delay_usecs);
317 }
318 }
319
IncrementMLIRImportFailureCount()320 void IncrementMLIRImportFailureCount() {
321 static auto* mlir_import_failure_count_cell =
322 mlir_import_failure_count->GetCell();
323 mlir_import_failure_count_cell->IncrementBy(1);
324 }
325
RecordUnusedOutput(const string & op_name)326 void RecordUnusedOutput(const string& op_name) {
327 graph_unused_outputs->GetCell(op_name)->IncrementBy(1);
328 }
329
330 } // namespace metrics
331 } // namespace tensorflow
332