1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/framework/metrics.h"
17 
18 #include "absl/strings/str_cat.h"
19 #include "tensorflow/core/lib/monitoring/counter.h"
20 #include "tensorflow/core/lib/monitoring/gauge.h"
21 #include "tensorflow/core/lib/monitoring/sampler.h"
22 
23 namespace tensorflow {
24 namespace metrics {
25 namespace {
26 
27 auto* graph_runs = monitoring::Counter<0>::New(
28     "/tensorflow/core/graph_runs",
29     "The number of graph executions used to collect "
30     "/tensorflow/core/graph_run_time_usecs");
31 
32 auto* graph_run_time_usecs = monitoring::Counter<0>::New(
33     "/tensorflow/core/graph_run_time_usecs",
34     "The total time spent on executing graphs in microseconds.");
35 
36 auto* graph_optimization_usecs =
37     monitoring::Counter<2>::New("/tensorflow/core/graph_optimization_usecs",
38                                 "The total time spent running each graph "
39                                 "optimization pass in microseconds.",
40                                 "kind", "name");
41 
42 auto* graph_run_time_usecs_histogram = monitoring::Sampler<0>::New(
43     {"/tensorflow/core/graph_run_time_usecs_histogram",
44      "The wall-clock time spent on executing graphs in microseconds."},
45     // Power of 2 with bucket count 20 (> 17 minutes)
46     {monitoring::Buckets::Exponential(1000, 2, 20)});
47 
48 auto* graph_pending_queue_length_histogram = monitoring::Sampler<0>::New(
49     {"/tensorflow/core/graph_pending_queue_length_histogram",
50      "The number of pending (ready but not running) tasks in graph executor."},
51     // Power of 1.5 with bucket count 30 (> 191k)
52     {monitoring::Buckets::Exponential(1, 1.5, 30)});
53 
54 auto* graph_run_input_tensor_bytes = monitoring::Sampler<0>::New(
55     {"/tensorflow/core/graph_run_input_tensor_bytes",
56      "The size of input tensors in bytes."},
57     // Power of 2 with bucket count 14 (256MB)
58     {monitoring::Buckets::Exponential(1, 4, 14)});
59 
60 auto* graph_run_output_tensor_bytes = monitoring::Sampler<0>::New(
61     {"/tensorflow/core/graph_run_output_tensor_bytes",
62      "The size of output tensors in bytes."},
63     // Power of 2 with bucket count 14 (256MB)
64     {monitoring::Buckets::Exponential(1, 4, 14)});
65 
66 auto* graph_unused_outputs = monitoring::Counter<1>::New(
67     "/tensorflow/core/graph_unused_outputs",
68     "The number of unused outputs for ops of a given type.", "name");
69 
70 auto* tf_data_autotune_counter = monitoring::Counter<1>::New(
71     "/tensorflow/data/autotune", "tf.data autotuning", "name");
72 
73 auto* tf_data_bytes_consumed_counter = monitoring::Counter<1>::New(
74     "/tensorflow/data/bytes_consumed",
75     "The number of bytes consumed by a tf.data Dataset.", "name");
76 
77 auto* tf_data_bytes_produced_counter = monitoring::Counter<1>::New(
78     "/tensorflow/data/bytes_produced",
79     "The number of bytes produced by a tf.data Dataset.", "name");
80 
81 auto* tf_data_bytes_read_counter = monitoring::Counter<1>::New(
82     "/tensorflow/data/bytes_read",
83     "The number of bytes read by tf.data Dataset sources.", "name");
84 
85 auto* tf_data_bytes_fetched_counter = monitoring::Counter<0>::New(
86     "/tensorflow/data/bytes_fetched",
87     "The number of bytes fetched from tf.data Dataset iterator.");
88 
89 auto* tf_data_elements_counter = monitoring::Counter<1>::New(
90     "/tensorflow/data/elements", "tf.data elements", "name");
91 
92 auto* tf_data_experiment_counter = monitoring::Counter<1>::New(
93     "/tensorflow/data/experiment",
94     "The number of times tf.data experiment is applied to input pipelines.",
95     "name");
96 
97 auto* tf_data_fingerprint_counter = monitoring::Counter<1>::New(
98     "/tensorflow/data/fingerprint", "tf.data fingerprint", "name");
99 
100 auto* tf_data_get_next_duration_usecs_histogram = monitoring::Sampler<0>::New(
101     {"/tensorflow/data/getnext_duration",
102      "Microseconds spent fetching an element from tf.data iterator."},
103     // Power of 2 with bucket count 10 (1024 microseconds) and 1 second.
104     {monitoring::Buckets::Explicit(
105         {2., 4., 8., 16., 32., 64., 128., 256., 512., 1024., 1e6})});
106 
107 auto* tf_data_iterator_busy_counter =
108     monitoring::Counter<0>::New("/tensorflow/data/iterator_busy",
109                                 "The time (in microseconds) during which a "
110                                 "tf.data iterator was busy processing at "
111                                 "least one `GetNext()` request.");
112 
113 auto* tf_data_iterator_lifetime_counter = monitoring::Counter<0>::New(
114     "/tensorflow/data/iterator_lifetime",
115     "The time (in microseconds) between a tf.data iterator receiving the first "
116     "`GetNext()` request and responding to the last `GetNext()` request.");
117 
118 auto* tf_data_optimization_counter = monitoring::Counter<1>::New(
119     "/tensorflow/data/optimization", "tf.data optimization", "name");
120 
121 auto* tf_data_service_workers_created_counter =
122     monitoring::Counter<0>::New("/tensorflow/data/service/workers_created",
123                                 "Number of tf.data service workers created");
124 
125 auto* tf_data_filename_counter = monitoring::Counter<2>::New(
126     "/tensorflow/data/filename", "The file name read by a tf.data Dataset.",
127     "name", "filename");
128 
129 auto* tf_data_model_gauge =
130     monitoring::Gauge<std::function<std::string()>, 1>::New(
131         "/tensorflow/data/model", "tf.data autotuning model proto.", "id");
132 
133 auto* tf_data_auto_shard = monitoring::Gauge<int64, 2>::New(
134     "/tensorflow/data/autoshard", "tf.data autoshard statistics.", "id",
135     "name");
136 
137 auto* parse_dense_feature_counter = monitoring::Counter<0>::New(
138     "/tensorflow/data/dense_feature",
139     "The number of dense features parsed by ops for parsing tf.Example.");
140 
141 auto* parse_sparse_feature_counter = monitoring::Counter<0>::New(
142     "/tensorflow/data/sparse_feature",
143     "The number of sparse features parsed by ops for parsing tf.Example.");
144 
145 auto* parse_ragged_feature_counter = monitoring::Counter<0>::New(
146     "/tensorflow/data/ragged_feature",
147     "The number of ragged features parsed by ops for parsing tf.Example.");
148 
149 auto* build_graph_calls = monitoring::Counter<0>::New(
150     "/tensorflow/core/graph_build_calls",
151     "The number of times TensorFlow has created a new client graph. "
152     "A client graph is a sub-graph of the full graph, induced by a set of "
153     "options, including the requested feeds and fetches. It includes time "
154     "spent optimizing the graph with Grappler, and time spent pruning the "
155     "sub-graph.");
156 
157 auto* build_graph_time_usecs = monitoring::Counter<0>::New(
158     "/tensorflow/core/graph_build_time_usecs",
159     "The amount of time TensorFlow has spent creating new client graphs in "
160     "microseconds. "
161     "A client graph is a sub-graph of the full graph, induced by a set of "
162     "options, including the requested feeds and fetches. It includes time "
163     "spent optimizing the graph with Grappler, and time spent pruning the "
164     "sub-graph.");
165 
166 auto* xla_compilations = monitoring::Counter<0>::New(
167     "/tensorflow/core/xla_compilations",
168     "The number of XLA compilations used to collect "
169     "/tensorflow/core/xla_compilation_time_usecs");
170 
171 auto* xla_compilation_time_usecs = monitoring::Counter<0>::New(
172     "/tensorflow/core/xla_compilation_time_usecs",
173     "The total time spent on compiling XLA graphs in microseconds.");
174 
175 auto* xla_tpu_spmd_cores_per_replica = monitoring::Counter<1>::New(
176     "/tensorflow/tpu/xla_spmd_cores_per_replica",
177     "The number of cores used by XLA SPMD-replicated models.", "cores");
178 
179 auto* bfc_allocator_delay =
180     monitoring::Counter<0>::New("/tensorflow/core/bfc_allocator_delay",
181                                 "The total time spent running each graph "
182                                 "optimization pass in microseconds.");
183 
184 auto* tpu_variable_distribution_time_usecs = monitoring::Counter<0>::New(
185     "/tensorflow/tpu/variable_distribution_time",
186     "Time spent sending variables from primary task to other worker tasks "
187     "at the start of a call to TPUExecute.  Timer starts at RunGraph "
188     "invocation and ends when TPUExecute args are ready on the current task.");
189 
190 }  // namespace
191 
RecordTFDataAutotune(const string & name)192 void RecordTFDataAutotune(const string& name) {
193   tf_data_autotune_counter->GetCell(name)->IncrementBy(1);
194 }
195 
GetTFDataBytesConsumedCounter(const string & name)196 monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name) {
197   return tf_data_bytes_consumed_counter->GetCell(name);
198 }
199 
GetTFDataBytesProducedCounter(const string & name)200 monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name) {
201   return tf_data_bytes_produced_counter->GetCell(name);
202 }
203 
GetTFDataBytesReadCounter(const string & name)204 monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name) {
205   return tf_data_bytes_read_counter->GetCell(name);
206 }
207 
GetTFDataElementsCounter(const string & name)208 monitoring::CounterCell* GetTFDataElementsCounter(const string& name) {
209   return tf_data_elements_counter->GetCell(name);
210 }
211 
GetTFDataModelGauge(const string & id)212 monitoring::GaugeCell<std::function<std::string()>>* GetTFDataModelGauge(
213     const string& id) {
214   return tf_data_model_gauge->GetCell(id);
215 }
216 
RecordTFDataBytesFetched(int64_t num_bytes)217 void RecordTFDataBytesFetched(int64_t num_bytes) {
218   tf_data_bytes_fetched_counter->GetCell()->IncrementBy(num_bytes);
219 }
220 
RecordTFDataExperiment(const string & name)221 void RecordTFDataExperiment(const string& name) {
222   tf_data_experiment_counter->GetCell(name)->IncrementBy(1);
223 }
224 
RecordTFDataFingerprint(const string & name)225 void RecordTFDataFingerprint(const string& name) {
226   tf_data_fingerprint_counter->GetCell(name)->IncrementBy(1);
227 }
228 
RecordTFDataGetNextDuration(uint64 duration_us)229 void RecordTFDataGetNextDuration(uint64 duration_us) {
230   static auto* tf_data_get_next_duration_cell =
231       tf_data_get_next_duration_usecs_histogram->GetCell();
232   tf_data_get_next_duration_cell->Add(duration_us);
233 }
234 
RecordTFDataIteratorBusy(uint64 duration_us)235 void RecordTFDataIteratorBusy(uint64 duration_us) {
236   static auto* tf_data_iterator_busy_cell =
237       tf_data_iterator_busy_counter->GetCell();
238   tf_data_iterator_busy_cell->IncrementBy(duration_us);
239 }
240 
RecordTFDataIteratorLifetime(uint64 duration_us)241 void RecordTFDataIteratorLifetime(uint64 duration_us) {
242   static auto* tf_data_iterator_lifetime_cell =
243       tf_data_iterator_lifetime_counter->GetCell();
244   tf_data_iterator_lifetime_cell->IncrementBy(duration_us);
245 }
246 
RecordTFDataOptimization(const string & name,int64_t num_changes)247 void RecordTFDataOptimization(const string& name, int64_t num_changes) {
248   tf_data_optimization_counter->GetCell(name)->IncrementBy(num_changes);
249 }
250 
RecordTFDataServiceWorkerCreated()251 void RecordTFDataServiceWorkerCreated() {
252   tf_data_service_workers_created_counter->GetCell()->IncrementBy(1);
253 }
254 
RecordTFDataFilename(const string & name,const string & filename)255 void RecordTFDataFilename(const string& name, const string& filename) {
256   tf_data_filename_counter->GetCell(name, filename)->IncrementBy(1);
257 }
258 
RecordTFDataAutoShard(const string & id,data::AutoShardPolicy policy,int64 num_workers,int64 num_replicas)259 void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy,
260                            int64 num_workers, int64 num_replicas) {
261   tf_data_auto_shard->GetCell(id, "policy")->Set(static_cast<int64>(policy));
262   tf_data_auto_shard->GetCell(id, "num_workers")->Set(num_workers);
263   tf_data_auto_shard->GetCell(id, "num_replicas")->Set(num_replicas);
264 }
265 
RecordParseDenseFeature(int64 num_features)266 void RecordParseDenseFeature(int64 num_features) {
267   static auto* parse_dense_feature_counter_cell =
268       parse_dense_feature_counter->GetCell();
269   parse_dense_feature_counter_cell->IncrementBy(num_features);
270 }
271 
RecordParseSparseFeature(int64_t num_features)272 void RecordParseSparseFeature(int64_t num_features) {
273   static auto* parse_sparse_feature_counter_cell =
274       parse_sparse_feature_counter->GetCell();
275   parse_sparse_feature_counter_cell->IncrementBy(num_features);
276 }
277 
RecordParseRaggedFeature(int64_t num_features)278 void RecordParseRaggedFeature(int64_t num_features) {
279   static auto* parse_ragged_feature_counter_cell =
280       parse_ragged_feature_counter->GetCell();
281   parse_ragged_feature_counter_cell->IncrementBy(num_features);
282 }
283 
RecordGraphInputTensors(const size_t size)284 void RecordGraphInputTensors(const size_t size) {
285   static auto* graph_run_input_tensor_bytes_cell =
286       graph_run_input_tensor_bytes->GetCell();
287   graph_run_input_tensor_bytes_cell->Add(size);
288 }
289 
RecordGraphOutputTensors(const size_t size)290 void RecordGraphOutputTensors(const size_t size) {
291   static auto* graph_run_output_tensor_bytes_cell =
292       graph_run_output_tensor_bytes->GetCell();
293   graph_run_output_tensor_bytes_cell->Add(size);
294 }
295 
RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica)296 void RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica) {
297   xla_tpu_spmd_cores_per_replica->GetCell(absl::StrCat(cores_per_replica))
298       ->IncrementBy(1);
299 }
300 
UpdateGraphExecTime(const uint64 running_time_usecs)301 void UpdateGraphExecTime(const uint64 running_time_usecs) {
302   if (running_time_usecs > 0) {
303     static auto* graph_runs_cell = graph_runs->GetCell();
304     static auto* graph_run_time_usecs_cell = graph_run_time_usecs->GetCell();
305     static auto* graph_run_time_usecs_histogram_cell =
306         graph_run_time_usecs_histogram->GetCell();
307     graph_runs_cell->IncrementBy(1);
308     graph_run_time_usecs_cell->IncrementBy(running_time_usecs);
309     graph_run_time_usecs_histogram_cell->Add(running_time_usecs);
310   }
311 }
312 
UpdateGraphPendingQueueLength(uint64 len)313 void UpdateGraphPendingQueueLength(uint64 len) {
314   static auto* graph_pending_queue_length_cell =
315       graph_pending_queue_length_histogram->GetCell();
316   graph_pending_queue_length_cell->Add(len);
317 }
318 
UpdateGraphOptimizationPassTime(const string & pass_name,const uint64 running_time_usecs)319 void UpdateGraphOptimizationPassTime(const string& pass_name,
320                                      const uint64 running_time_usecs) {
321   if (running_time_usecs > 0) {
322     graph_optimization_usecs->GetCell("GraphOptimizationPass", pass_name)
323         ->IncrementBy(running_time_usecs);
324   }
325 }
326 
UpdateGrapplerPassTime(const string & pass_name,const uint64 running_time_usecs)327 void UpdateGrapplerPassTime(const string& pass_name,
328                             const uint64 running_time_usecs) {
329   if (running_time_usecs > 0) {
330     graph_optimization_usecs->GetCell("Grappler", pass_name)
331         ->IncrementBy(running_time_usecs);
332   }
333 }
334 
UpdateGraphBuildTime(const uint64 running_time_usecs)335 void UpdateGraphBuildTime(const uint64 running_time_usecs) {
336   if (running_time_usecs > 0) {
337     static auto* build_graph_calls_cell = build_graph_calls->GetCell();
338     static auto* build_graph_time_usecs_cell =
339         build_graph_time_usecs->GetCell();
340     build_graph_calls_cell->IncrementBy(1);
341     build_graph_time_usecs_cell->IncrementBy(running_time_usecs);
342   }
343 }
344 
UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs)345 void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs) {
346   if (distribution_time_usecs > 0) {
347     tpu_variable_distribution_time_usecs->GetCell()->IncrementBy(
348         distribution_time_usecs);
349   }
350 }
351 
UpdateXlaCompilationTime(const uint64 compilation_time_usecs)352 void UpdateXlaCompilationTime(const uint64 compilation_time_usecs) {
353   if (compilation_time_usecs > 0) {
354     static auto* xla_compilations_cell = xla_compilations->GetCell();
355     static auto* xla_compilation_time_usecs_cell =
356         xla_compilation_time_usecs->GetCell();
357     xla_compilations_cell->IncrementBy(1);
358     xla_compilation_time_usecs_cell->IncrementBy(compilation_time_usecs);
359   }
360 }
361 
UpdateBfcAllocatorDelayTime(const uint64 delay_usecs)362 void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs) {
363   static auto* bfc_allocator_delay_cell = bfc_allocator_delay->GetCell();
364   if (delay_usecs > 0) {
365     bfc_allocator_delay_cell->IncrementBy(delay_usecs);
366   }
367 }
368 
RecordUnusedOutput(const string & op_name)369 void RecordUnusedOutput(const string& op_name) {
370   graph_unused_outputs->GetCell(op_name)->IncrementBy(1);
371 }
372 
373 }  // namespace metrics
374 }  // namespace tensorflow
375