1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/framework/metrics.h"
17
18 #include <cstdint>
19 #include <string>
20
21 #include "absl/strings/str_cat.h"
22 #include "tensorflow/core/lib/monitoring/counter.h"
23 #include "tensorflow/core/lib/monitoring/gauge.h"
24 #include "tensorflow/core/lib/monitoring/sampler.h"
25 #include "tensorflow/core/protobuf/data_service.pb.h"
26
27 namespace tensorflow {
28 namespace metrics {
29 namespace {
30
31 auto* graph_runs = monitoring::Counter<0>::New(
32 "/tensorflow/core/graph_runs",
33 "The number of graph executions used to collect "
34 "/tensorflow/core/graph_run_time_usecs");
35
36 auto* graph_run_time_usecs = monitoring::Counter<0>::New(
37 "/tensorflow/core/graph_run_time_usecs",
38 "The total time spent on executing graphs in microseconds.");
39
40 auto* graph_run_time_usecs_histogram = monitoring::Sampler<0>::New(
41 {"/tensorflow/core/graph_run_time_usecs_histogram",
42 "The wall-clock time spent on executing graphs in microseconds."},
43 // Power of 2 with bucket count 20 (> 17 minutes)
44 {monitoring::Buckets::Exponential(1000, 2, 20)});
45
46 auto* graph_pending_queue_length_histogram = monitoring::Sampler<0>::New(
47 {"/tensorflow/core/graph_pending_queue_length_histogram",
48 "The number of pending (ready but not running) tasks in graph executor."},
49 // Power of 1.5 with bucket count 30 (> 191k)
50 {monitoring::Buckets::Exponential(1, 1.5, 30)});
51
52 auto* graph_run_input_tensor_bytes = monitoring::Sampler<0>::New(
53 {"/tensorflow/core/graph_run_input_tensor_bytes",
54 "The size of input tensors in bytes."},
55 // Power of 2 with bucket count 14 (256MB)
56 {monitoring::Buckets::Exponential(1, 4, 14)});
57
58 auto* graph_run_output_tensor_bytes = monitoring::Sampler<0>::New(
59 {"/tensorflow/core/graph_run_output_tensor_bytes",
60 "The size of output tensors in bytes."},
61 // Power of 2 with bucket count 14 (256MB)
62 {monitoring::Buckets::Exponential(1, 4, 14)});
63
64 auto* graph_unused_outputs = monitoring::Counter<1>::New(
65 "/tensorflow/core/graph_unused_outputs",
66 "The number of unused outputs for ops of a given type.", "name");
67
68 auto* tf_data_autotune_counter = monitoring::Counter<1>::New(
69 "/tensorflow/data/autotune", "tf.data autotuning", "name");
70
71 auto* tf_data_bytes_consumed_counter = monitoring::Counter<1>::New(
72 "/tensorflow/data/bytes_consumed",
73 "The number of bytes consumed by a tf.data Dataset.", "name");
74
75 auto* tf_data_bytes_produced_counter = monitoring::Counter<1>::New(
76 "/tensorflow/data/bytes_produced",
77 "The number of bytes produced by a tf.data Dataset.", "name");
78
79 auto* tf_data_bytes_read_counter = monitoring::Counter<1>::New(
80 "/tensorflow/data/bytes_read",
81 "The number of bytes read by tf.data Dataset sources.", "name");
82
83 auto* tf_data_bytes_fetched_counter = monitoring::Counter<0>::New(
84 "/tensorflow/data/bytes_fetched",
85 "The number of bytes fetched from tf.data Dataset iterator.");
86
87 auto* tf_data_elements_counter = monitoring::Counter<1>::New(
88 "/tensorflow/data/elements", "tf.data elements", "name");
89
90 auto* tf_data_experiment_counter = monitoring::Counter<1>::New(
91 "/tensorflow/data/experiment",
92 "The number of times tf.data experiment is applied to input pipelines.",
93 "name");
94
95 auto* tf_data_fingerprint_counter = monitoring::Counter<1>::New(
96 "/tensorflow/data/fingerprint", "tf.data fingerprint", "name");
97
98 auto* tf_data_get_next_duration_usecs_histogram = monitoring::Sampler<0>::New(
99 {"/tensorflow/data/getnext_duration",
100 "Microseconds spent fetching an element from tf.data iterator."},
101 // Power of 2 with bucket count 10 (1024 microseconds) and 1 second.
102 {monitoring::Buckets::Explicit(
103 {2., 4., 8., 16., 32., 64., 128., 256., 512., 1024., 1e6})});
104
105 auto* tf_data_used_vs_budget_ratio_histogram = monitoring::Sampler<0>::New(
106 {"/tensorflow/data/used_vs_budget_ratio",
107 "Ratio of tf.data used ram over ram budget when running optimization."},
108 // Uniform linear buckets with count 10 from 0 to 2
109 {monitoring::Buckets::Explicit(
110 {0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0})});
111
112 auto* tf_data_buffered_vs_budget_ratio_histogram = monitoring::Sampler<0>::New(
113 {"/tensorflow/data/buffered_vs_budget_ratio",
114 "Ratio of tf.data max buffer bytes over ram budget when running "
115 "optimization."},
116 // Uniform linear buckets with count 10 from 0 to 2
117 {monitoring::Buckets::Explicit(
118 {0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0})});
119
120 auto* tf_data_iterator_busy_counter =
121 monitoring::Counter<0>::New("/tensorflow/data/iterator_busy",
122 "The time (in microseconds) during which a "
123 "tf.data iterator was busy processing at "
124 "least one `GetNext()` request.");
125
126 auto* tf_data_iterator_lifetime_counter = monitoring::Counter<0>::New(
127 "/tensorflow/data/iterator_lifetime",
128 "The time (in microseconds) between a tf.data iterator receiving the first "
129 "`GetNext()` request and responding to the last `GetNext()` request.");
130
131 auto* tf_data_iterator_gap_msec_histogram = monitoring::Sampler<0>::New(
132 {"/tensorflow/data/iterator_gap",
133 "The time (in milliseconds) between a tf.data iterator responding to a "
134 "`GetNext()` request and receiving the next `GetNext()` request."},
135 // Power of 1.5 with bucket count of 20 (from 1 msec to about 2.2 secs).
136 {monitoring::Buckets::Exponential(1, 1.5, 20)});
137
138 auto* tf_data_optimization_counter = monitoring::Counter<1>::New(
139 "/tensorflow/data/optimization", "tf.data optimization", "name");
140
141 auto* tf_data_service_workers_created_counter =
142 monitoring::Counter<0>::New("/tensorflow/data/service/workers_created",
143 "Number of tf.data service workers created");
144
145 auto* tf_data_service_jobs_created_counter = monitoring::Counter<2>::New(
146 "/tensorflow/data/service/jobs_created", "Number of tf.data service jobs.",
147 "processing_mode", "coordinated_read");
148
149 auto* tf_data_service_client_iterators_counter = monitoring::Counter<4>::New(
150 "/tensorflow/data/service/client_iterators",
151 "Number of tf.data service client iterators created.", "worker_uid",
152 "deployment_mode", "processing_mode", "is_coordinated_read");
153
154 auto* tf_data_service_cross_trainer_cache_queries_counter =
155 monitoring::Counter<1>::New(
156 "/tensorflow/data/service/cross_trainer_cache_queries",
157 "tf.data service cross-trainer cache queries counter. The result can "
158 "be hit or miss.",
159 "cache_hit");
160
161 auto* tf_data_service_cross_trainer_cache_size_bytes =
162 monitoring::Gauge<int64_t, 0>::New(
163 "/tensorflow/data/service/cross_trainer_cache_size_bytes",
164 "tf.data service cross-trainer cache memory usage in bytes.");
165
166 auto* tf_data_filename_counter = monitoring::Counter<2>::New(
167 "/tensorflow/data/filename", "The file name read by a tf.data Dataset.",
168 "name", "filename");
169
170 auto* tf_data_model_gauge =
171 monitoring::Gauge<std::function<std::string()>, 1>::New(
172 "/tensorflow/data/model", "tf.data autotuning model proto.", "id");
173
174 auto* tf_data_auto_shard = monitoring::Gauge<int64, 2>::New(
175 "/tensorflow/data/autoshard", "tf.data autoshard statistics.", "id",
176 "name");
177
178 auto* tf_data_auto_shard_rewrite_batch_size_eligible =
179 monitoring::Counter<1>::New(
180 "/tensorflow/data/autoshard_rewrite_batch_size/eligible",
181 "Whether tf.data pipelines that are eligible for autoshard "
182 "to rewrite the batch size.",
183 "eligible");
184
185 auto* tf_data_auto_shard_rewrite_batch_size_reason =
186 monitoring::Counter<1>::New(
187 "/tensorflow/data/autoshard_rewrite_batch_size/reason",
188 "The reasons that tf.data pipelines are ineligible for autoshard "
189 "to rewrite the batch size.",
190 "reason");
191
192 auto* tf_data_autotune_stopping_criteria_counter =
193 monitoring::Counter<1>::New("/tensorflow/data/autotune_stopping_criteria",
194 "The number of times each tf.data autotune "
195 "algorithm stopping criterion is met.",
196 "name");
197
198 auto* parse_dense_feature_counter = monitoring::Counter<0>::New(
199 "/tensorflow/data/dense_feature",
200 "The number of dense features parsed by ops for parsing tf.Example.");
201
202 auto* parse_sparse_feature_counter = monitoring::Counter<0>::New(
203 "/tensorflow/data/sparse_feature",
204 "The number of sparse features parsed by ops for parsing tf.Example.");
205
206 auto* parse_ragged_feature_counter = monitoring::Counter<0>::New(
207 "/tensorflow/data/ragged_feature",
208 "The number of ragged features parsed by ops for parsing tf.Example.");
209
210 auto* build_graph_calls = monitoring::Counter<0>::New(
211 "/tensorflow/core/graph_build_calls",
212 "The number of times TensorFlow has created a new client graph. "
213 "A client graph is a sub-graph of the full graph, induced by a set of "
214 "options, including the requested feeds and fetches. It includes time "
215 "spent optimizing the graph with Grappler, and time spent pruning the "
216 "sub-graph.");
217
218 auto* build_graph_time_usecs = monitoring::Counter<0>::New(
219 "/tensorflow/core/graph_build_time_usecs",
220 "The amount of time TensorFlow has spent creating new client graphs in "
221 "microseconds. "
222 "A client graph is a sub-graph of the full graph, induced by a set of "
223 "options, including the requested feeds and fetches. It includes time "
224 "spent optimizing the graph with Grappler, and time spent pruning the "
225 "sub-graph.");
226
227 auto* xla_compilations = monitoring::Counter<0>::New(
228 "/tensorflow/core/xla_compilations",
229 "The number of XLA compilations used to collect "
230 "/tensorflow/core/xla_compilation_time_usecs");
231
232 auto* xla_compilation_time_usecs = monitoring::Counter<0>::New(
233 "/tensorflow/core/xla_compilation_time_usecs",
234 "The total time spent on compiling XLA graphs in microseconds.");
235
236 auto* xla_tpu_spmd_cores_per_replica = monitoring::Counter<1>::New(
237 "/tensorflow/tpu/xla_spmd_cores_per_replica",
238 "The number of cores used by XLA SPMD-replicated models.", "cores");
239
240 auto* bfc_allocator_delay =
241 monitoring::Counter<0>::New("/tensorflow/core/bfc_allocator_delay",
242 "The total time spent running each graph "
243 "optimization pass in microseconds.");
244
245 auto* tpu_variable_distribution_time_usecs = monitoring::Counter<0>::New(
246 "/tensorflow/tpu/variable_distribution_time",
247 "Time spent sending variables from primary task to other worker tasks "
248 "at the start of a call to TPUExecute. Timer starts at RunGraph "
249 "invocation and ends when TPUExecute args are ready on the current task.");
250
251 auto* test_counters =
252 monitoring::Counter<2>::New("/tensorflow/core/test_counters",
253 "Counters used for testing.", "name", "label");
254
255 } // namespace
256
// NOTE(review): unlike the metrics above, these two live outside the
// anonymous namespace and therefore have external linkage — presumably they
// are referenced from another translation unit; confirm before moving them
// into the anonymous namespace.

// Counts TPU-related errors, keyed by op name and error type.
auto* tpu_op_error_counter = monitoring::Counter<2>::New(
    "/tensorflow/tpu/op_error_count",
    "Count the tpu related errors by op and error_type.", "op", "error_type");

// Central counter for eager-client errors, keyed by source and error type.
auto* eager_client_error_counter = monitoring::Counter<2>::New(
    "/tensorflow/core/eager_client_error_count",
    "Count the errors in eager client as a central place.", "error_source",
    "error_type");
265
GetGraphOptimizationCounter()266 monitoring::Counter<2>* GetGraphOptimizationCounter() {
267 static auto* graph_optimization_counter =
268 monitoring::Counter<2>::New("/tensorflow/core/graph_optimization_usecs",
269 "The total time spent running each graph "
270 "optimization pass in microseconds.",
271 "kind", "name");
272 return graph_optimization_counter;
273 }
274
RecordTFDataAutotune(const string & name)275 void RecordTFDataAutotune(const string& name) {
276 tf_data_autotune_counter->GetCell(name)->IncrementBy(1);
277 }
278
GetTFDataBytesConsumedCounter(const string & name)279 monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name) {
280 return tf_data_bytes_consumed_counter->GetCell(name);
281 }
282
GetTFDataBytesProducedCounter(const string & name)283 monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name) {
284 return tf_data_bytes_produced_counter->GetCell(name);
285 }
286
GetTFDataBytesReadCounter(const string & name)287 monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name) {
288 return tf_data_bytes_read_counter->GetCell(name);
289 }
290
GetTFDataElementsCounter(const string & name)291 monitoring::CounterCell* GetTFDataElementsCounter(const string& name) {
292 return tf_data_elements_counter->GetCell(name);
293 }
294
GetTFDataModelGauge(const string & id)295 monitoring::GaugeCell<std::function<std::string()>>* GetTFDataModelGauge(
296 const string& id) {
297 return tf_data_model_gauge->GetCell(id);
298 }
299
RecordTFDataBytesFetched(int64_t num_bytes)300 void RecordTFDataBytesFetched(int64_t num_bytes) {
301 tf_data_bytes_fetched_counter->GetCell()->IncrementBy(num_bytes);
302 }
303
RecordTFDataExperiment(const string & name)304 void RecordTFDataExperiment(const string& name) {
305 tf_data_experiment_counter->GetCell(name)->IncrementBy(1);
306 }
307
RecordTFDataFingerprint(const string & name)308 void RecordTFDataFingerprint(const string& name) {
309 tf_data_fingerprint_counter->GetCell(name)->IncrementBy(1);
310 }
311
RecordTFDataGetNextDuration(uint64 duration_us)312 void RecordTFDataGetNextDuration(uint64 duration_us) {
313 static auto* tf_data_get_next_duration_cell =
314 tf_data_get_next_duration_usecs_histogram->GetCell();
315 tf_data_get_next_duration_cell->Add(duration_us);
316 }
317
RecordTFDataAutotuneUsedRamBudgetRatio(const double ratio)318 void RecordTFDataAutotuneUsedRamBudgetRatio(const double ratio) {
319 static auto* tf_data_used_vs_budget_ratio_histogram_cell =
320 tf_data_used_vs_budget_ratio_histogram->GetCell();
321 tf_data_used_vs_budget_ratio_histogram_cell->Add(ratio);
322 }
323
RecordTFDataAutotuneMaxBufferBudgetRatio(const double ratio)324 void RecordTFDataAutotuneMaxBufferBudgetRatio(const double ratio) {
325 static auto* tf_data_buffered_vs_budget_ratio_histogram_cell =
326 tf_data_buffered_vs_budget_ratio_histogram->GetCell();
327 tf_data_buffered_vs_budget_ratio_histogram_cell->Add(ratio);
328 }
329
RecordTFDataIteratorBusy(uint64 duration_us)330 void RecordTFDataIteratorBusy(uint64 duration_us) {
331 static auto* tf_data_iterator_busy_cell =
332 tf_data_iterator_busy_counter->GetCell();
333 tf_data_iterator_busy_cell->IncrementBy(duration_us);
334 }
335
RecordTFDataIteratorLifetime(uint64 duration_us)336 void RecordTFDataIteratorLifetime(uint64 duration_us) {
337 static auto* tf_data_iterator_lifetime_cell =
338 tf_data_iterator_lifetime_counter->GetCell();
339 tf_data_iterator_lifetime_cell->IncrementBy(duration_us);
340 }
341
RecordTFDataIteratorGap(uint64 duration_us)342 void RecordTFDataIteratorGap(uint64 duration_us) {
343 static auto* tf_data_iterator_gap_msec_histogram_cell =
344 tf_data_iterator_gap_msec_histogram->GetCell();
345 tf_data_iterator_gap_msec_histogram_cell->Add(duration_us * 0.001);
346 }
347
RecordTFDataOptimization(const string & name,int64_t num_changes)348 void RecordTFDataOptimization(const string& name, int64_t num_changes) {
349 tf_data_optimization_counter->GetCell(name)->IncrementBy(num_changes);
350 }
351
RecordTFDataServiceWorkerCreated()352 void RecordTFDataServiceWorkerCreated() {
353 tf_data_service_workers_created_counter->GetCell()->IncrementBy(1);
354 }
355
RecordTFDataServiceJobsCreated(const tensorflow::data::ProcessingModeDef & processing_mode,bool is_coordinated_read)356 void RecordTFDataServiceJobsCreated(
357 const tensorflow::data::ProcessingModeDef& processing_mode,
358 bool is_coordinated_read) {
359 const std::string sharding_policy_str =
360 data::ProcessingModeDef::ShardingPolicy_Name(
361 processing_mode.sharding_policy());
362 const std::string coordinated_read_str =
363 is_coordinated_read ? "true" : "false";
364 tf_data_service_jobs_created_counter
365 ->GetCell(sharding_policy_str, coordinated_read_str)
366 ->IncrementBy(1);
367 }
368
RecordTFDataServiceClientIterators(int64_t worker_uid,tensorflow::data::DeploymentMode deployment_mode,const tensorflow::data::ProcessingModeDef & processing_mode,bool is_coordinated_read)369 void RecordTFDataServiceClientIterators(
370 int64_t worker_uid, tensorflow::data::DeploymentMode deployment_mode,
371 const tensorflow::data::ProcessingModeDef& processing_mode,
372 bool is_coordinated_read) {
373 const std::string deployment_mode_str =
374 tensorflow::data::DeploymentMode_Name(deployment_mode);
375 const std::string sharding_policy_str =
376 data::ProcessingModeDef::ShardingPolicy_Name(
377 processing_mode.sharding_policy());
378 const std::string coordinated_read_str =
379 is_coordinated_read ? "true" : "false";
380 tf_data_service_client_iterators_counter
381 ->GetCell(absl::StrCat(worker_uid), deployment_mode_str,
382 sharding_policy_str, coordinated_read_str)
383 ->IncrementBy(1);
384 }
385
RecordTFDataServiceCrossTrainerCacheQuery(bool cache_hit)386 void RecordTFDataServiceCrossTrainerCacheQuery(bool cache_hit) {
387 std::string cache_hit_str = cache_hit ? "true" : "false";
388 tf_data_service_cross_trainer_cache_queries_counter->GetCell(cache_hit_str)
389 ->IncrementBy(1);
390 }
391
RecordTFDataServiceCrossTrainerCacheSizeBytes(size_t bytes)392 void RecordTFDataServiceCrossTrainerCacheSizeBytes(size_t bytes) {
393 tf_data_service_cross_trainer_cache_size_bytes->GetCell()->Set(
394 static_cast<int64_t>(bytes));
395 }
396
RecordTFDataFilename(const string & name,const string & filename)397 void RecordTFDataFilename(const string& name, const string& filename) {
398 tf_data_filename_counter->GetCell(name, filename)->IncrementBy(1);
399 }
400
RecordTFDataAutoShard(const string & id,data::AutoShardPolicy policy,int64 num_workers,int64 num_replicas)401 void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy,
402 int64 num_workers, int64 num_replicas) {
403 tf_data_auto_shard->GetCell(id, "policy")->Set(static_cast<int64_t>(policy));
404 tf_data_auto_shard->GetCell(id, "num_workers")->Set(num_workers);
405 tf_data_auto_shard->GetCell(id, "num_replicas")->Set(num_replicas);
406 }
407
RecordTFDataAutoShardRewriteBatchSize(bool eligible,const std::vector<string> & ineligible_reason)408 void RecordTFDataAutoShardRewriteBatchSize(
409 bool eligible, const std::vector<string>& ineligible_reason) {
410 tf_data_auto_shard_rewrite_batch_size_eligible
411 ->GetCell(eligible ? "true" : "false")
412 ->IncrementBy(1);
413 for (const string& reason : ineligible_reason) {
414 tf_data_auto_shard_rewrite_batch_size_reason->GetCell(reason)->IncrementBy(
415 1);
416 }
417 }
418
RecordTFDataAutotuneStoppingCriteria(const string & name)419 void RecordTFDataAutotuneStoppingCriteria(const string& name) {
420 tf_data_autotune_stopping_criteria_counter->GetCell(name)->IncrementBy(1);
421 }
422
RecordParseDenseFeature(int64 num_features)423 void RecordParseDenseFeature(int64 num_features) {
424 static auto* parse_dense_feature_counter_cell =
425 parse_dense_feature_counter->GetCell();
426 parse_dense_feature_counter_cell->IncrementBy(num_features);
427 }
428
RecordParseSparseFeature(int64_t num_features)429 void RecordParseSparseFeature(int64_t num_features) {
430 static auto* parse_sparse_feature_counter_cell =
431 parse_sparse_feature_counter->GetCell();
432 parse_sparse_feature_counter_cell->IncrementBy(num_features);
433 }
434
RecordParseRaggedFeature(int64_t num_features)435 void RecordParseRaggedFeature(int64_t num_features) {
436 static auto* parse_ragged_feature_counter_cell =
437 parse_ragged_feature_counter->GetCell();
438 parse_ragged_feature_counter_cell->IncrementBy(num_features);
439 }
440
RecordGraphInputTensors(const size_t size)441 void RecordGraphInputTensors(const size_t size) {
442 static auto* graph_run_input_tensor_bytes_cell =
443 graph_run_input_tensor_bytes->GetCell();
444 graph_run_input_tensor_bytes_cell->Add(size);
445 }
446
RecordGraphOutputTensors(const size_t size)447 void RecordGraphOutputTensors(const size_t size) {
448 static auto* graph_run_output_tensor_bytes_cell =
449 graph_run_output_tensor_bytes->GetCell();
450 graph_run_output_tensor_bytes_cell->Add(size);
451 }
452
RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica)453 void RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica) {
454 xla_tpu_spmd_cores_per_replica->GetCell(absl::StrCat(cores_per_replica))
455 ->IncrementBy(1);
456 }
457
UpdateGraphExecTime(const uint64 running_time_usecs)458 void UpdateGraphExecTime(const uint64 running_time_usecs) {
459 if (running_time_usecs > 0) {
460 static auto* graph_runs_cell = graph_runs->GetCell();
461 static auto* graph_run_time_usecs_cell = graph_run_time_usecs->GetCell();
462 static auto* graph_run_time_usecs_histogram_cell =
463 graph_run_time_usecs_histogram->GetCell();
464 graph_runs_cell->IncrementBy(1);
465 graph_run_time_usecs_cell->IncrementBy(running_time_usecs);
466 graph_run_time_usecs_histogram_cell->Add(running_time_usecs);
467 }
468 }
469
UpdateGraphPendingQueueLength(uint64 len)470 void UpdateGraphPendingQueueLength(uint64 len) {
471 static auto* graph_pending_queue_length_cell =
472 graph_pending_queue_length_histogram->GetCell();
473 graph_pending_queue_length_cell->Add(len);
474 }
475
UpdateGraphBuildTime(const uint64 running_time_usecs)476 void UpdateGraphBuildTime(const uint64 running_time_usecs) {
477 if (running_time_usecs > 0) {
478 static auto* build_graph_calls_cell = build_graph_calls->GetCell();
479 static auto* build_graph_time_usecs_cell =
480 build_graph_time_usecs->GetCell();
481 build_graph_calls_cell->IncrementBy(1);
482 build_graph_time_usecs_cell->IncrementBy(running_time_usecs);
483 }
484 }
485
UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs)486 void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs) {
487 if (distribution_time_usecs > 0) {
488 tpu_variable_distribution_time_usecs->GetCell()->IncrementBy(
489 distribution_time_usecs);
490 }
491 }
492
UpdateXlaCompilationTime(const uint64 compilation_time_usecs)493 void UpdateXlaCompilationTime(const uint64 compilation_time_usecs) {
494 if (compilation_time_usecs > 0) {
495 static auto* xla_compilations_cell = xla_compilations->GetCell();
496 static auto* xla_compilation_time_usecs_cell =
497 xla_compilation_time_usecs->GetCell();
498 xla_compilations_cell->IncrementBy(1);
499 xla_compilation_time_usecs_cell->IncrementBy(compilation_time_usecs);
500 }
501 }
502
UpdateBfcAllocatorDelayTime(const uint64 delay_usecs)503 void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs) {
504 static auto* bfc_allocator_delay_cell = bfc_allocator_delay->GetCell();
505 if (delay_usecs > 0) {
506 bfc_allocator_delay_cell->IncrementBy(delay_usecs);
507 }
508 }
509
RecordUnusedOutput(const string & op_name)510 void RecordUnusedOutput(const string& op_name) {
511 graph_unused_outputs->GetCell(op_name)->IncrementBy(1);
512 }
513
IncrementTestCounter(const string & name,const string & label)514 void IncrementTestCounter(const string& name, const string& label) {
515 test_counters->GetCell(name, label)->IncrementBy(1);
516 }
517
TestCounter(const string & name,const string & label)518 const monitoring::CounterCell* TestCounter(const string& name,
519 const string& label) {
520 return test_counters->GetCell(name, label);
521 }
522
// Binds this delta tracker to the test counter cell (`name`, `label`) and
// snapshots its current value so Get() initially reports zero.
TestDelta::TestDelta(const string& name, const string& label)
    : cell_(TestCounter(name, label)) {
  Reset();
}
527
// Re-snapshots the counter; subsequent Get() calls report the change since
// this point.
void TestDelta::Reset() { last_value_ = cell_->value(); }
529
// Returns how much the counter has grown since construction or the last
// Reset().
int64 TestDelta::Get() { return cell_->value() - last_value_; }
531
UpdateTfMlirGraphOptimizationPassStateCounter(const std::string & pass_state,const std::string & processing_state)532 void UpdateTfMlirGraphOptimizationPassStateCounter(
533 const std::string& pass_state, const std::string& processing_state) {
534 static auto* metric = monitoring::Counter<2>::New(
535 "/tensorflow/core/tf_mlir_update_graph_optimization_pass_state_counter",
536 "Tracks changes in a graph's UpdateTfMlirGraphOptimizationPassState",
537 "PassState", "ProcessingState");
538
539 metric->GetCell(pass_state, processing_state)->IncrementBy(1);
540 }
541
UpdateTfMlirBridgeFirstPhaseCounter(const std::string & device_type,const std::string & bridge_version,bool fallback_enabled,const std::string & result)542 void UpdateTfMlirBridgeFirstPhaseCounter(const std::string& device_type,
543 const std::string& bridge_version,
544 bool fallback_enabled,
545 const std::string& result) {
546 static auto* metric = monitoring::Counter<4>::New(
547 "/tensorflow/core/tf_mlir_bridge_first_phase_count",
548 "Tracks processing state in first phase of mlir bridge", "device",
549 "version", "fallback", "result");
550 std::string fallback_status =
551 fallback_enabled ? "fallback_enabled" : "fallback_disabled";
552 metric->GetCell(device_type, bridge_version, fallback_status, result)
553 ->IncrementBy(1);
554 }
555
UpdateTpuErrorCounter(const string & op,const string & error_type)556 void UpdateTpuErrorCounter(const string& op, const string& error_type) {
557 tpu_op_error_counter->GetCell(op, error_type)->IncrementBy(1);
558 }
559
UpdateEagerClientErrorCounter(const string & error_source,const string & error_type)560 void UpdateEagerClientErrorCounter(const string& error_source,
561 const string& error_type) {
562 eager_client_error_counter->GetCell(error_source, error_type)->IncrementBy(1);
563 }
564
UpdateTfMlirBridgeGraphAnalysisPerOp(const std::string & op_name,const std::string & construction_context,bool is_single_core_inference_mode,const std::string & num_replicas,const std::string & num_cores_per_replica,const std::string & use_tpu,const std::string & allow_soft_placement,const std::string & use_spmd_for_xla_partitioning,const std::string & unsupported_reason,bool has_unsupported_features)565 void UpdateTfMlirBridgeGraphAnalysisPerOp(
566 const std::string& op_name, const std::string& construction_context,
567 bool is_single_core_inference_mode, const std::string& num_replicas,
568 const std::string& num_cores_per_replica, const std::string& use_tpu,
569 const std::string& allow_soft_placement,
570 const std::string& use_spmd_for_xla_partitioning,
571 const std::string& unsupported_reason, bool has_unsupported_features) {
572 static auto* metric = monitoring::Counter<10>::New(
573 "/tensorflow/core/tf_mlir_bridge_graph_analysis_per_op",
574 "Tracks processing state per op in first phase of mlir bridge", "op_name",
575 "construction_context", "is_single_core_inference_mode", "num_replicas",
576 "num_cores_per_replica", "use_tpu", "allow_soft_placement",
577 "use_spmd_for_xla_partitioning", "unsupported_reason",
578 "has_unsupported_features");
579
580 metric
581 ->GetCell(op_name, construction_context,
582 is_single_core_inference_mode ? "Yes" : "No", num_replicas,
583 num_cores_per_replica, use_tpu, allow_soft_placement,
584 use_spmd_for_xla_partitioning, unsupported_reason,
585 has_unsupported_features ? "Yes" : "No")
586 ->IncrementBy(1);
587 }
588
589 } // namespace metrics
590 } // namespace tensorflow
591