/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/framework/metrics.h"

#include "absl/strings/str_cat.h"
#include "tensorflow/core/lib/monitoring/counter.h"
#include "tensorflow/core/lib/monitoring/gauge.h"
#include "tensorflow/core/lib/monitoring/sampler.h"

namespace tensorflow {
namespace metrics {
namespace {

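// The metric objects below are created once via the monitoring API and live
// for the lifetime of the process; the recorder functions further down look
// up the appropriate cell and update it.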
auto* graph_runs = monitoring::Counter<0>::New(
    "/tensorflow/core/graph_runs",
    "The number of graph executions used to collect "
    "/tensorflow/core/graph_run_time_usecs");

auto* graph_run_time_usecs = monitoring::Counter<0>::New(
    "/tensorflow/core/graph_run_time_usecs",
    "The total time spent on executing graphs in microseconds.");

auto* graph_optimization_usecs =
    monitoring::Counter<2>::New("/tensorflow/core/graph_optimization_usecs",
                                "The total time spent running each graph "
                                "optimization pass in microseconds.",
                                "kind", "name");

auto* graph_run_time_usecs_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/core/graph_run_time_usecs_histogram",
     "The wall-clock time spent on executing graphs in microseconds."},
    // Power of 2 with bucket count 20 (> 17 minutes)
    {monitoring::Buckets::Exponential(1000, 2, 20)});

auto* graph_pending_queue_length_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/core/graph_pending_queue_length_histogram",
     "The number of pending (ready but not running) tasks in graph executor."},
    // Power of 1.5 with bucket count 30 (> 191k)
    {monitoring::Buckets::Exponential(1, 1.5, 30)});

auto* graph_run_input_tensor_bytes = monitoring::Sampler<0>::New(
    {"/tensorflow/core/graph_run_input_tensor_bytes",
     "The size of input tensors in bytes."},
    // Power of 4 with bucket count 14 (256MB)
    {monitoring::Buckets::Exponential(1, 4, 14)});

auto* graph_run_output_tensor_bytes = monitoring::Sampler<0>::New(
    {"/tensorflow/core/graph_run_output_tensor_bytes",
     "The size of output tensors in bytes."},
    // Power of 4 with bucket count 14 (256MB)
    {monitoring::Buckets::Exponential(1, 4, 14)});

auto* graph_unused_outputs = monitoring::Counter<1>::New(
    "/tensorflow/core/graph_unused_outputs",
    "The number of unused outputs for ops of a given type.", "name");

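// tf.data metrics, exported under /tensorflow/data/.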
auto* tf_data_autotune_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/autotune", "tf.data autotuning", "name");

auto* tf_data_bytes_consumed_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/bytes_consumed",
    "The number of bytes consumed by a tf.data Dataset.", "name");

auto* tf_data_bytes_produced_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/bytes_produced",
    "The number of bytes produced by a tf.data Dataset.", "name");

auto* tf_data_bytes_read_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/bytes_read",
    "The number of bytes read by tf.data Dataset sources.", "name");

auto* tf_data_bytes_fetched_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/bytes_fetched",
    "The number of bytes fetched from tf.data Dataset iterator.");

auto* tf_data_elements_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/elements", "tf.data elements", "name");

auto* tf_data_experiment_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/experiment",
    "The number of times tf.data experiment is applied to input pipelines.",
    "name");

auto* tf_data_fingerprint_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/fingerprint", "tf.data fingerprint", "name");

auto* tf_data_get_next_duration_usecs_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/data/getnext_duration",
     "Microseconds spent fetching an element from tf.data iterator."},
    // Power of 2 with bucket count 10 (1024 microseconds) and 1 second.
    {monitoring::Buckets::Explicit(
        {2., 4., 8., 16., 32., 64., 128., 256., 512., 1024., 1e6})});

auto* tf_data_iterator_busy_counter =
    monitoring::Counter<0>::New("/tensorflow/data/iterator_busy",
                                "The time (in microseconds) during which a "
                                "tf.data iterator was busy processing at "
                                "least one `GetNext()` request.");

auto* tf_data_iterator_lifetime_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/iterator_lifetime",
    "The time (in microseconds) between a tf.data iterator receiving the first "
    "`GetNext()` request and responding to the last `GetNext()` request.");

auto* tf_data_optimization_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/optimization", "tf.data optimization", "name");

auto* tf_data_service_workers_created_counter =
    monitoring::Counter<0>::New("/tensorflow/data/service/workers_created",
                                "Number of tf.data service workers created");

auto* tf_data_filename_counter = monitoring::Counter<2>::New(
    "/tensorflow/data/filename", "The file name read by a tf.data Dataset.",
    "name", "filename");

auto* tf_data_model_gauge =
    monitoring::Gauge<std::function<std::string()>, 1>::New(
        "/tensorflow/data/model", "tf.data autotuning model proto.", "id");

auto* tf_data_auto_shard = monitoring::Gauge<int64, 2>::New(
    "/tensorflow/data/autoshard", "tf.data autoshard statistics.", "id",
    "name");

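// tf.Example parsing metrics, recorded by the ops that parse dense, sparse,
// and ragged features.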
auto* parse_dense_feature_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/dense_feature",
    "The number of dense features parsed by ops for parsing tf.Example.");

auto* parse_sparse_feature_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/sparse_feature",
    "The number of sparse features parsed by ops for parsing tf.Example.");

auto* parse_ragged_feature_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/ragged_feature",
    "The number of ragged features parsed by ops for parsing tf.Example.");

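// Client-graph construction, XLA compilation, TPU, and allocator metrics.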
auto* build_graph_calls = monitoring::Counter<0>::New(
    "/tensorflow/core/graph_build_calls",
    "The number of times TensorFlow has created a new client graph. "
    "A client graph is a sub-graph of the full graph, induced by a set of "
    "options, including the requested feeds and fetches. It includes time "
    "spent optimizing the graph with Grappler, and time spent pruning the "
    "sub-graph.");

auto* build_graph_time_usecs = monitoring::Counter<0>::New(
    "/tensorflow/core/graph_build_time_usecs",
    "The amount of time TensorFlow has spent creating new client graphs in "
    "microseconds. "
    "A client graph is a sub-graph of the full graph, induced by a set of "
    "options, including the requested feeds and fetches. It includes time "
    "spent optimizing the graph with Grappler, and time spent pruning the "
    "sub-graph.");

auto* xla_compilations = monitoring::Counter<0>::New(
    "/tensorflow/core/xla_compilations",
    "The number of XLA compilations used to collect "
    "/tensorflow/core/xla_compilation_time_usecs");

auto* xla_compilation_time_usecs = monitoring::Counter<0>::New(
    "/tensorflow/core/xla_compilation_time_usecs",
    "The total time spent on compiling XLA graphs in microseconds.");

auto* xla_tpu_spmd_cores_per_replica = monitoring::Counter<1>::New(
    "/tensorflow/tpu/xla_spmd_cores_per_replica",
    "The number of cores used by XLA SPMD-replicated models.", "cores");

auto* bfc_allocator_delay =
    monitoring::Counter<0>::New("/tensorflow/core/bfc_allocator_delay",
                                "The total time spent delaying memory "
                                "allocations in the BFC allocator, in "
                                "microseconds.");

auto* tpu_variable_distribution_time_usecs = monitoring::Counter<0>::New(
    "/tensorflow/tpu/variable_distribution_time",
    "Time spent sending variables from primary task to other worker tasks "
    "at the start of a call to TPUExecute. Timer starts at RunGraph "
    "invocation and ends when TPUExecute args are ready on the current task.");

}  // namespace

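// The recorder functions below forward to the metric cells defined above.
// Several of them cache the unlabeled cell pointer in a function-local static
// so repeated calls avoid the cell lookup.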
void RecordTFDataAutotune(const string& name) {
  tf_data_autotune_counter->GetCell(name)->IncrementBy(1);
}

monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name) {
  return tf_data_bytes_consumed_counter->GetCell(name);
}

monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name) {
  return tf_data_bytes_produced_counter->GetCell(name);
}

monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name) {
  return tf_data_bytes_read_counter->GetCell(name);
}

monitoring::CounterCell* GetTFDataElementsCounter(const string& name) {
  return tf_data_elements_counter->GetCell(name);
}

monitoring::GaugeCell<std::function<std::string()>>* GetTFDataModelGauge(
    const string& id) {
  return tf_data_model_gauge->GetCell(id);
}

void RecordTFDataBytesFetched(int64_t num_bytes) {
  tf_data_bytes_fetched_counter->GetCell()->IncrementBy(num_bytes);
}

void RecordTFDataExperiment(const string& name) {
  tf_data_experiment_counter->GetCell(name)->IncrementBy(1);
}

void RecordTFDataFingerprint(const string& name) {
  tf_data_fingerprint_counter->GetCell(name)->IncrementBy(1);
}

void RecordTFDataGetNextDuration(uint64 duration_us) {
  static auto* tf_data_get_next_duration_cell =
      tf_data_get_next_duration_usecs_histogram->GetCell();
  tf_data_get_next_duration_cell->Add(duration_us);
}

void RecordTFDataIteratorBusy(uint64 duration_us) {
  static auto* tf_data_iterator_busy_cell =
      tf_data_iterator_busy_counter->GetCell();
  tf_data_iterator_busy_cell->IncrementBy(duration_us);
}

void RecordTFDataIteratorLifetime(uint64 duration_us) {
  static auto* tf_data_iterator_lifetime_cell =
      tf_data_iterator_lifetime_counter->GetCell();
  tf_data_iterator_lifetime_cell->IncrementBy(duration_us);
}

void RecordTFDataOptimization(const string& name, int64_t num_changes) {
  tf_data_optimization_counter->GetCell(name)->IncrementBy(num_changes);
}

void RecordTFDataServiceWorkerCreated() {
  tf_data_service_workers_created_counter->GetCell()->IncrementBy(1);
}

void RecordTFDataFilename(const string& name, const string& filename) {
  tf_data_filename_counter->GetCell(name, filename)->IncrementBy(1);
}

void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy,
                           int64 num_workers, int64 num_replicas) {
  tf_data_auto_shard->GetCell(id, "policy")->Set(static_cast<int64>(policy));
  tf_data_auto_shard->GetCell(id, "num_workers")->Set(num_workers);
  tf_data_auto_shard->GetCell(id, "num_replicas")->Set(num_replicas);
}

void RecordParseDenseFeature(int64 num_features) {
  static auto* parse_dense_feature_counter_cell =
      parse_dense_feature_counter->GetCell();
  parse_dense_feature_counter_cell->IncrementBy(num_features);
}

void RecordParseSparseFeature(int64_t num_features) {
  static auto* parse_sparse_feature_counter_cell =
      parse_sparse_feature_counter->GetCell();
  parse_sparse_feature_counter_cell->IncrementBy(num_features);
}

void RecordParseRaggedFeature(int64_t num_features) {
  static auto* parse_ragged_feature_counter_cell =
      parse_ragged_feature_counter->GetCell();
  parse_ragged_feature_counter_cell->IncrementBy(num_features);
}

void RecordGraphInputTensors(const size_t size) {
  static auto* graph_run_input_tensor_bytes_cell =
      graph_run_input_tensor_bytes->GetCell();
  graph_run_input_tensor_bytes_cell->Add(size);
}

void RecordGraphOutputTensors(const size_t size) {
  static auto* graph_run_output_tensor_bytes_cell =
      graph_run_output_tensor_bytes->GetCell();
  graph_run_output_tensor_bytes_cell->Add(size);
}

void RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica) {
  xla_tpu_spmd_cores_per_replica->GetCell(absl::StrCat(cores_per_replica))
      ->IncrementBy(1);
}

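// Records one graph execution: bumps the run counter and adds the wall-clock
// time to both the cumulative counter and the histogram. A typical caller
// (hypothetical sketch, not taken from this file) might do:
//
//   const uint64 start_us = Env::Default()->NowMicros();
//   // ... run the graph ...
//   metrics::UpdateGraphExecTime(Env::Default()->NowMicros() - start_us);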
void UpdateGraphExecTime(const uint64 running_time_usecs) {
  if (running_time_usecs > 0) {
    static auto* graph_runs_cell = graph_runs->GetCell();
    static auto* graph_run_time_usecs_cell = graph_run_time_usecs->GetCell();
    static auto* graph_run_time_usecs_histogram_cell =
        graph_run_time_usecs_histogram->GetCell();
    graph_runs_cell->IncrementBy(1);
    graph_run_time_usecs_cell->IncrementBy(running_time_usecs);
    graph_run_time_usecs_histogram_cell->Add(running_time_usecs);
  }
}

void UpdateGraphPendingQueueLength(uint64 len) {
  static auto* graph_pending_queue_length_cell =
      graph_pending_queue_length_histogram->GetCell();
  graph_pending_queue_length_cell->Add(len);
}

void UpdateGraphOptimizationPassTime(const string& pass_name,
                                     const uint64 running_time_usecs) {
  if (running_time_usecs > 0) {
    graph_optimization_usecs->GetCell("GraphOptimizationPass", pass_name)
        ->IncrementBy(running_time_usecs);
  }
}

void UpdateGrapplerPassTime(const string& pass_name,
                            const uint64 running_time_usecs) {
  if (running_time_usecs > 0) {
    graph_optimization_usecs->GetCell("Grappler", pass_name)
        ->IncrementBy(running_time_usecs);
  }
}

void UpdateGraphBuildTime(const uint64 running_time_usecs) {
  if (running_time_usecs > 0) {
    static auto* build_graph_calls_cell = build_graph_calls->GetCell();
    static auto* build_graph_time_usecs_cell =
        build_graph_time_usecs->GetCell();
    build_graph_calls_cell->IncrementBy(1);
    build_graph_time_usecs_cell->IncrementBy(running_time_usecs);
  }
}

void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs) {
  if (distribution_time_usecs > 0) {
    tpu_variable_distribution_time_usecs->GetCell()->IncrementBy(
        distribution_time_usecs);
  }
}

void UpdateXlaCompilationTime(const uint64 compilation_time_usecs) {
  if (compilation_time_usecs > 0) {
    static auto* xla_compilations_cell = xla_compilations->GetCell();
    static auto* xla_compilation_time_usecs_cell =
        xla_compilation_time_usecs->GetCell();
    xla_compilations_cell->IncrementBy(1);
    xla_compilation_time_usecs_cell->IncrementBy(compilation_time_usecs);
  }
}

void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs) {
  static auto* bfc_allocator_delay_cell = bfc_allocator_delay->GetCell();
  if (delay_usecs > 0) {
    bfc_allocator_delay_cell->IncrementBy(delay_usecs);
  }
}

void RecordUnusedOutput(const string& op_name) {
  graph_unused_outputs->GetCell(op_name)->IncrementBy(1);
}

}  // namespace metrics
}  // namespace tensorflow