/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/framework/metrics.h"

#include <cstdint>
#include <string>

#include "absl/strings/str_cat.h"
#include "tensorflow/core/lib/monitoring/counter.h"
#include "tensorflow/core/lib/monitoring/gauge.h"
#include "tensorflow/core/lib/monitoring/sampler.h"
#include "tensorflow/core/protobuf/data_service.pb.h"

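// This file defines the process-wide monitoring metrics declared in
// metrics.h. Each metric object below is registered once during static
// initialization and is then updated through the Record*/Update* helpers
// defined later in the file.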
namespace tensorflow {
namespace metrics {
namespace {

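// The integer template argument on Counter<N>, Gauge<T, N>, and Sampler<N>
// is the number of string labels the metric carries; GetCell() later takes
// exactly one value per label.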
auto* graph_runs = monitoring::Counter<0>::New(
    "/tensorflow/core/graph_runs",
    "The number of graph executions used to collect "
    "/tensorflow/core/graph_run_time_usecs");

auto* graph_run_time_usecs = monitoring::Counter<0>::New(
    "/tensorflow/core/graph_run_time_usecs",
    "The total time spent on executing graphs in microseconds.");

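// Buckets::Exponential(scale, growth, count) produces histogram bucket
// limits of the form scale * growth^i, so each comment below notes the
// approximate range the last bucket reaches.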
auto* graph_run_time_usecs_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/core/graph_run_time_usecs_histogram",
     "The wall-clock time spent on executing graphs in microseconds."},
    // Power of 2 with bucket count 20 (> 17 minutes)
    {monitoring::Buckets::Exponential(1000, 2, 20)});

auto* graph_pending_queue_length_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/core/graph_pending_queue_length_histogram",
     "The number of pending (ready but not running) tasks in graph executor."},
    // Power of 1.5 with bucket count 30 (> 191k)
    {monitoring::Buckets::Exponential(1, 1.5, 30)});

auto* graph_run_input_tensor_bytes = monitoring::Sampler<0>::New(
    {"/tensorflow/core/graph_run_input_tensor_bytes",
     "The size of input tensors in bytes."},
    // Power of 4 with bucket count 14 (256MB)
    {monitoring::Buckets::Exponential(1, 4, 14)});

auto* graph_run_output_tensor_bytes = monitoring::Sampler<0>::New(
    {"/tensorflow/core/graph_run_output_tensor_bytes",
     "The size of output tensors in bytes."},
    // Power of 4 with bucket count 14 (256MB)
    {monitoring::Buckets::Exponential(1, 4, 14)});

auto* graph_unused_outputs = monitoring::Counter<1>::New(
    "/tensorflow/core/graph_unused_outputs",
    "The number of unused outputs for ops of a given type.", "name");

auto* tf_data_autotune_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/autotune", "tf.data autotuning", "name");

auto* tf_data_bytes_consumed_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/bytes_consumed",
    "The number of bytes consumed by a tf.data Dataset.", "name");

auto* tf_data_bytes_produced_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/bytes_produced",
    "The number of bytes produced by a tf.data Dataset.", "name");

auto* tf_data_bytes_read_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/bytes_read",
    "The number of bytes read by tf.data Dataset sources.", "name");

auto* tf_data_bytes_fetched_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/bytes_fetched",
    "The number of bytes fetched from tf.data Dataset iterator.");

auto* tf_data_elements_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/elements", "tf.data elements", "name");

auto* tf_data_experiment_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/experiment",
    "The number of times a tf.data experiment is applied to input pipelines.",
    "name");

auto* tf_data_fingerprint_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/fingerprint", "tf.data fingerprint", "name");

auto* tf_data_get_next_duration_usecs_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/data/getnext_duration",
     "Microseconds spent fetching an element from tf.data iterator."},
    // Power of 2 with bucket count 10 (1024 microseconds) and 1 second.
    {monitoring::Buckets::Explicit(
        {2., 4., 8., 16., 32., 64., 128., 256., 512., 1024., 1e6})});

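// Buckets::Explicit takes the literal bucket limits, used where an
// exponential progression is a poor fit, e.g. the linear 0-2 ratio
// histograms below.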
auto* tf_data_used_vs_budget_ratio_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/data/used_vs_budget_ratio",
     "Ratio of RAM used by tf.data over the RAM budget when running "
     "optimization."},
    // Uniform linear buckets with count 10 from 0 to 2
    {monitoring::Buckets::Explicit(
        {0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0})});

auto* tf_data_buffered_vs_budget_ratio_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/data/buffered_vs_budget_ratio",
     "Ratio of tf.data max buffer bytes over the RAM budget when running "
     "optimization."},
    // Uniform linear buckets with count 10 from 0 to 2
    {monitoring::Buckets::Explicit(
        {0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0})});

auto* tf_data_iterator_busy_counter =
    monitoring::Counter<0>::New("/tensorflow/data/iterator_busy",
                                "The time (in microseconds) during which a "
                                "tf.data iterator was busy processing at "
                                "least one `GetNext()` request.");

auto* tf_data_iterator_lifetime_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/iterator_lifetime",
    "The time (in microseconds) between a tf.data iterator receiving the first "
    "`GetNext()` request and responding to the last `GetNext()` request.");

auto* tf_data_iterator_gap_msec_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/data/iterator_gap",
     "The time (in milliseconds) between a tf.data iterator responding to a "
     "`GetNext()` request and receiving the next `GetNext()` request."},
    // Power of 1.5 with bucket count of 20 (from 1 msec to about 2.2 secs).
    {monitoring::Buckets::Exponential(1, 1.5, 20)});

auto* tf_data_optimization_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/optimization", "tf.data optimization", "name");

auto* tf_data_service_workers_created_counter =
    monitoring::Counter<0>::New("/tensorflow/data/service/workers_created",
                                "Number of tf.data service workers created");

auto* tf_data_service_jobs_created_counter = monitoring::Counter<2>::New(
    "/tensorflow/data/service/jobs_created", "Number of tf.data service jobs.",
    "processing_mode", "coordinated_read");

auto* tf_data_service_client_iterators_counter = monitoring::Counter<4>::New(
    "/tensorflow/data/service/client_iterators",
    "Number of tf.data service client iterators created.", "worker_uid",
    "deployment_mode", "processing_mode", "is_coordinated_read");

auto* tf_data_service_cross_trainer_cache_queries_counter =
    monitoring::Counter<1>::New(
        "/tensorflow/data/service/cross_trainer_cache_queries",
        "Counter of tf.data service cross-trainer cache queries. The result "
        "can be a hit or a miss.",
        "cache_hit");

auto* tf_data_service_cross_trainer_cache_size_bytes =
    monitoring::Gauge<int64_t, 0>::New(
        "/tensorflow/data/service/cross_trainer_cache_size_bytes",
        "tf.data service cross-trainer cache memory usage in bytes.");

auto* tf_data_filename_counter = monitoring::Counter<2>::New(
    "/tensorflow/data/filename", "The file name read by a tf.data Dataset.",
    "name", "filename");

auto* tf_data_model_gauge =
    monitoring::Gauge<std::function<std::string()>, 1>::New(
        "/tensorflow/data/model", "tf.data autotuning model proto.", "id");

auto* tf_data_auto_shard = monitoring::Gauge<int64, 2>::New(
    "/tensorflow/data/autoshard", "tf.data autoshard statistics.", "id",
    "name");

auto* tf_data_auto_shard_rewrite_batch_size_eligible =
    monitoring::Counter<1>::New(
        "/tensorflow/data/autoshard_rewrite_batch_size/eligible",
        "Whether tf.data pipelines are eligible for autoshard "
        "to rewrite the batch size.",
        "eligible");

auto* tf_data_auto_shard_rewrite_batch_size_reason =
    monitoring::Counter<1>::New(
        "/tensorflow/data/autoshard_rewrite_batch_size/reason",
        "The reasons that tf.data pipelines are ineligible for autoshard "
        "to rewrite the batch size.",
        "reason");

auto* tf_data_autotune_stopping_criteria_counter =
    monitoring::Counter<1>::New("/tensorflow/data/autotune_stopping_criteria",
                                "The number of times each tf.data autotune "
                                "algorithm stopping criterion is met.",
                                "name");

auto* parse_dense_feature_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/dense_feature",
    "The number of dense features parsed by ops for parsing tf.Example.");

auto* parse_sparse_feature_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/sparse_feature",
    "The number of sparse features parsed by ops for parsing tf.Example.");

auto* parse_ragged_feature_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/ragged_feature",
    "The number of ragged features parsed by ops for parsing tf.Example.");

auto* build_graph_calls = monitoring::Counter<0>::New(
    "/tensorflow/core/graph_build_calls",
    "The number of times TensorFlow has created a new client graph. "
    "A client graph is a sub-graph of the full graph, induced by a set of "
    "options, including the requested feeds and fetches. Building it "
    "includes optimizing the graph with Grappler and pruning the "
    "sub-graph.");

auto* build_graph_time_usecs = monitoring::Counter<0>::New(
    "/tensorflow/core/graph_build_time_usecs",
    "The amount of time TensorFlow has spent creating new client graphs, in "
    "microseconds. "
    "A client graph is a sub-graph of the full graph, induced by a set of "
    "options, including the requested feeds and fetches. The time includes "
    "optimizing the graph with Grappler and pruning the sub-graph.");

auto* xla_compilations = monitoring::Counter<0>::New(
    "/tensorflow/core/xla_compilations",
    "The number of XLA compilations used to collect "
    "/tensorflow/core/xla_compilation_time_usecs");

auto* xla_compilation_time_usecs = monitoring::Counter<0>::New(
    "/tensorflow/core/xla_compilation_time_usecs",
    "The total time spent on compiling XLA graphs in microseconds.");

auto* xla_tpu_spmd_cores_per_replica = monitoring::Counter<1>::New(
    "/tensorflow/tpu/xla_spmd_cores_per_replica",
    "The number of cores used by XLA SPMD-replicated models.", "cores");

auto* bfc_allocator_delay =
    monitoring::Counter<0>::New("/tensorflow/core/bfc_allocator_delay",
                                "The total time (in microseconds) that the "
                                "BFC allocator has spent waiting for memory "
                                "to become available.");

auto* tpu_variable_distribution_time_usecs = monitoring::Counter<0>::New(
    "/tensorflow/tpu/variable_distribution_time",
    "Time spent sending variables from primary task to other worker tasks "
    "at the start of a call to TPUExecute.  Timer starts at RunGraph "
    "invocation and ends when TPUExecute args are ready on the current task.");

auto* test_counters =
    monitoring::Counter<2>::New("/tensorflow/core/test_counters",
                                "Counters used for testing.", "name", "label");

}  // namespace

auto* tpu_op_error_counter = monitoring::Counter<2>::New(
    "/tensorflow/tpu/op_error_count",
    "Count of TPU-related errors, by op and error_type.", "op", "error_type");

auto* eager_client_error_counter = monitoring::Counter<2>::New(
    "/tensorflow/core/eager_client_error_count",
    "Central count of errors in the eager client.", "error_source",
    "error_type");

monitoring::Counter<2>* GetGraphOptimizationCounter() {
  static auto* graph_optimization_counter =
      monitoring::Counter<2>::New("/tensorflow/core/graph_optimization_usecs",
                                  "The total time spent running each graph "
                                  "optimization pass in microseconds.",
                                  "kind", "name");
  return graph_optimization_counter;
}

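// Example usage (illustrative only; the label values and variables here are
// the caller's, not mandated by this file):
//   GetGraphOptimizationCounter()
//       ->GetCell("grappler", pass_name)
//       ->IncrementBy(pass_duration_usecs);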
void RecordTFDataAutotune(const string& name) {
  tf_data_autotune_counter->GetCell(name)->IncrementBy(1);
}

monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name) {
  return tf_data_bytes_consumed_counter->GetCell(name);
}

monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name) {
  return tf_data_bytes_produced_counter->GetCell(name);
}

monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name) {
  return tf_data_bytes_read_counter->GetCell(name);
}

monitoring::CounterCell* GetTFDataElementsCounter(const string& name) {
  return tf_data_elements_counter->GetCell(name);
}

monitoring::GaugeCell<std::function<std::string()>>* GetTFDataModelGauge(
    const string& id) {
  return tf_data_model_gauge->GetCell(id);
}

void RecordTFDataBytesFetched(int64_t num_bytes) {
  tf_data_bytes_fetched_counter->GetCell()->IncrementBy(num_bytes);
}

void RecordTFDataExperiment(const string& name) {
  tf_data_experiment_counter->GetCell(name)->IncrementBy(1);
}

void RecordTFDataFingerprint(const string& name) {
  tf_data_fingerprint_counter->GetCell(name)->IncrementBy(1);
}

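// Several of the helpers below sit on hot paths, so they cache their metric
// cell in a function-local static rather than calling GetCell() (a map
// lookup) on every invocation. This is safe for unlabeled metrics, which
// have exactly one cell.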
void RecordTFDataGetNextDuration(uint64 duration_us) {
  static auto* tf_data_get_next_duration_cell =
      tf_data_get_next_duration_usecs_histogram->GetCell();
  tf_data_get_next_duration_cell->Add(duration_us);
}

void RecordTFDataAutotuneUsedRamBudgetRatio(const double ratio) {
  static auto* tf_data_used_vs_budget_ratio_histogram_cell =
      tf_data_used_vs_budget_ratio_histogram->GetCell();
  tf_data_used_vs_budget_ratio_histogram_cell->Add(ratio);
}

void RecordTFDataAutotuneMaxBufferBudgetRatio(const double ratio) {
  static auto* tf_data_buffered_vs_budget_ratio_histogram_cell =
      tf_data_buffered_vs_budget_ratio_histogram->GetCell();
  tf_data_buffered_vs_budget_ratio_histogram_cell->Add(ratio);
}

void RecordTFDataIteratorBusy(uint64 duration_us) {
  static auto* tf_data_iterator_busy_cell =
      tf_data_iterator_busy_counter->GetCell();
  tf_data_iterator_busy_cell->IncrementBy(duration_us);
}

void RecordTFDataIteratorLifetime(uint64 duration_us) {
  static auto* tf_data_iterator_lifetime_cell =
      tf_data_iterator_lifetime_counter->GetCell();
  tf_data_iterator_lifetime_cell->IncrementBy(duration_us);
}

void RecordTFDataIteratorGap(uint64 duration_us) {
  static auto* tf_data_iterator_gap_msec_histogram_cell =
      tf_data_iterator_gap_msec_histogram->GetCell();
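  // The gap is reported in microseconds, but the histogram is bucketed in
  // milliseconds, hence the 0.001 factor.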
  tf_data_iterator_gap_msec_histogram_cell->Add(duration_us * 0.001);
}

void RecordTFDataOptimization(const string& name, int64_t num_changes) {
  tf_data_optimization_counter->GetCell(name)->IncrementBy(num_changes);
}

void RecordTFDataServiceWorkerCreated() {
  tf_data_service_workers_created_counter->GetCell()->IncrementBy(1);
}

void RecordTFDataServiceJobsCreated(
    const tensorflow::data::ProcessingModeDef& processing_mode,
    bool is_coordinated_read) {
  const std::string sharding_policy_str =
      data::ProcessingModeDef::ShardingPolicy_Name(
          processing_mode.sharding_policy());
  const std::string coordinated_read_str =
      is_coordinated_read ? "true" : "false";
  tf_data_service_jobs_created_counter
      ->GetCell(sharding_policy_str, coordinated_read_str)
      ->IncrementBy(1);
}

void RecordTFDataServiceClientIterators(
    int64_t worker_uid, tensorflow::data::DeploymentMode deployment_mode,
    const tensorflow::data::ProcessingModeDef& processing_mode,
    bool is_coordinated_read) {
  const std::string deployment_mode_str =
      tensorflow::data::DeploymentMode_Name(deployment_mode);
  const std::string sharding_policy_str =
      data::ProcessingModeDef::ShardingPolicy_Name(
          processing_mode.sharding_policy());
  const std::string coordinated_read_str =
      is_coordinated_read ? "true" : "false";
  tf_data_service_client_iterators_counter
      ->GetCell(absl::StrCat(worker_uid), deployment_mode_str,
                sharding_policy_str, coordinated_read_str)
      ->IncrementBy(1);
}

void RecordTFDataServiceCrossTrainerCacheQuery(bool cache_hit) {
  std::string cache_hit_str = cache_hit ? "true" : "false";
  tf_data_service_cross_trainer_cache_queries_counter->GetCell(cache_hit_str)
      ->IncrementBy(1);
}

void RecordTFDataServiceCrossTrainerCacheSizeBytes(size_t bytes) {
  tf_data_service_cross_trainer_cache_size_bytes->GetCell()->Set(
      static_cast<int64_t>(bytes));
}

void RecordTFDataFilename(const string& name, const string& filename) {
  tf_data_filename_counter->GetCell(name, filename)->IncrementBy(1);
}

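// One auto-shard event records three gauge cells keyed by the same pipeline
// id, with the second label selecting the statistic: ("<id>", "policy"),
// ("<id>", "num_workers"), and ("<id>", "num_replicas").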
void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy,
                           int64 num_workers, int64 num_replicas) {
  tf_data_auto_shard->GetCell(id, "policy")->Set(static_cast<int64_t>(policy));
  tf_data_auto_shard->GetCell(id, "num_workers")->Set(num_workers);
  tf_data_auto_shard->GetCell(id, "num_replicas")->Set(num_replicas);
}

void RecordTFDataAutoShardRewriteBatchSize(
    bool eligible, const std::vector<string>& ineligible_reason) {
  tf_data_auto_shard_rewrite_batch_size_eligible
      ->GetCell(eligible ? "true" : "false")
      ->IncrementBy(1);
  for (const string& reason : ineligible_reason) {
    tf_data_auto_shard_rewrite_batch_size_reason->GetCell(reason)->IncrementBy(
        1);
  }
}

void RecordTFDataAutotuneStoppingCriteria(const string& name) {
  tf_data_autotune_stopping_criteria_counter->GetCell(name)->IncrementBy(1);
}

void RecordParseDenseFeature(int64 num_features) {
  static auto* parse_dense_feature_counter_cell =
      parse_dense_feature_counter->GetCell();
  parse_dense_feature_counter_cell->IncrementBy(num_features);
}

void RecordParseSparseFeature(int64_t num_features) {
  static auto* parse_sparse_feature_counter_cell =
      parse_sparse_feature_counter->GetCell();
  parse_sparse_feature_counter_cell->IncrementBy(num_features);
}

void RecordParseRaggedFeature(int64_t num_features) {
  static auto* parse_ragged_feature_counter_cell =
      parse_ragged_feature_counter->GetCell();
  parse_ragged_feature_counter_cell->IncrementBy(num_features);
}

void RecordGraphInputTensors(const size_t size) {
  static auto* graph_run_input_tensor_bytes_cell =
      graph_run_input_tensor_bytes->GetCell();
  graph_run_input_tensor_bytes_cell->Add(size);
}

void RecordGraphOutputTensors(const size_t size) {
  static auto* graph_run_output_tensor_bytes_cell =
      graph_run_output_tensor_bytes->GetCell();
  graph_run_output_tensor_bytes_cell->Add(size);
}

void RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica) {
  xla_tpu_spmd_cores_per_replica->GetCell(absl::StrCat(cores_per_replica))
      ->IncrementBy(1);
}

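// Zero durations are skipped so that /tensorflow/core/graph_runs counts
// only the executions that contributed time to
// /tensorflow/core/graph_run_time_usecs.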
void UpdateGraphExecTime(const uint64 running_time_usecs) {
  if (running_time_usecs > 0) {
    static auto* graph_runs_cell = graph_runs->GetCell();
    static auto* graph_run_time_usecs_cell = graph_run_time_usecs->GetCell();
    static auto* graph_run_time_usecs_histogram_cell =
        graph_run_time_usecs_histogram->GetCell();
    graph_runs_cell->IncrementBy(1);
    graph_run_time_usecs_cell->IncrementBy(running_time_usecs);
    graph_run_time_usecs_histogram_cell->Add(running_time_usecs);
  }
}

void UpdateGraphPendingQueueLength(uint64 len) {
  static auto* graph_pending_queue_length_cell =
      graph_pending_queue_length_histogram->GetCell();
  graph_pending_queue_length_cell->Add(len);
}

void UpdateGraphBuildTime(const uint64 running_time_usecs) {
  if (running_time_usecs > 0) {
    static auto* build_graph_calls_cell = build_graph_calls->GetCell();
    static auto* build_graph_time_usecs_cell =
        build_graph_time_usecs->GetCell();
    build_graph_calls_cell->IncrementBy(1);
    build_graph_time_usecs_cell->IncrementBy(running_time_usecs);
  }
}

void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs) {
  if (distribution_time_usecs > 0) {
    tpu_variable_distribution_time_usecs->GetCell()->IncrementBy(
        distribution_time_usecs);
  }
}

void UpdateXlaCompilationTime(const uint64 compilation_time_usecs) {
  if (compilation_time_usecs > 0) {
    static auto* xla_compilations_cell = xla_compilations->GetCell();
    static auto* xla_compilation_time_usecs_cell =
        xla_compilation_time_usecs->GetCell();
    xla_compilations_cell->IncrementBy(1);
    xla_compilation_time_usecs_cell->IncrementBy(compilation_time_usecs);
  }
}

void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs) {
  static auto* bfc_allocator_delay_cell = bfc_allocator_delay->GetCell();
  if (delay_usecs > 0) {
    bfc_allocator_delay_cell->IncrementBy(delay_usecs);
  }
}

void RecordUnusedOutput(const string& op_name) {
  graph_unused_outputs->GetCell(op_name)->IncrementBy(1);
}

void IncrementTestCounter(const string& name, const string& label) {
  test_counters->GetCell(name, label)->IncrementBy(1);
}

const monitoring::CounterCell* TestCounter(const string& name,
                                           const string& label) {
  return test_counters->GetCell(name, label);
}

TestDelta::TestDelta(const string& name, const string& label)
    : cell_(TestCounter(name, label)) {
  Reset();
}

void TestDelta::Reset() { last_value_ = cell_->value(); }

int64 TestDelta::Get() { return cell_->value() - last_value_; }

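// Example test usage (illustrative names):
//   TestDelta delta("my_counter", "my_label");
//   IncrementTestCounter("my_counter", "my_label");
//   // delta.Get() now returns 1: only increments since construction count.

// Unlike the metrics at namespace scope above, the MLIR-bridge metrics below
// are registered lazily, inside a function-local static, the first time the
// corresponding Update* function runs.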
void UpdateTfMlirGraphOptimizationPassStateCounter(
    const std::string& pass_state, const std::string& processing_state) {
  static auto* metric = monitoring::Counter<2>::New(
      "/tensorflow/core/tf_mlir_update_graph_optimization_pass_state_counter",
      "Tracks changes in a graph's UpdateTfMlirGraphOptimizationPassState",
      "PassState", "ProcessingState");

  metric->GetCell(pass_state, processing_state)->IncrementBy(1);
}

void UpdateTfMlirBridgeFirstPhaseCounter(const std::string& device_type,
                                         const std::string& bridge_version,
                                         bool fallback_enabled,
                                         const std::string& result) {
  static auto* metric = monitoring::Counter<4>::New(
      "/tensorflow/core/tf_mlir_bridge_first_phase_count",
      "Tracks processing state in first phase of mlir bridge", "device",
      "version", "fallback", "result");
  std::string fallback_status =
      fallback_enabled ? "fallback_enabled" : "fallback_disabled";
  metric->GetCell(device_type, bridge_version, fallback_status, result)
      ->IncrementBy(1);
}

void UpdateTpuErrorCounter(const string& op, const string& error_type) {
  tpu_op_error_counter->GetCell(op, error_type)->IncrementBy(1);
}

void UpdateEagerClientErrorCounter(const string& error_source,
                                   const string& error_type) {
  eager_client_error_counter->GetCell(error_source, error_type)->IncrementBy(1);
}

void UpdateTfMlirBridgeGraphAnalysisPerOp(
    const std::string& op_name, const std::string& construction_context,
    bool is_single_core_inference_mode, const std::string& num_replicas,
    const std::string& num_cores_per_replica, const std::string& use_tpu,
    const std::string& allow_soft_placement,
    const std::string& use_spmd_for_xla_partitioning,
    const std::string& unsupported_reason, bool has_unsupported_features) {
  static auto* metric = monitoring::Counter<10>::New(
      "/tensorflow/core/tf_mlir_bridge_graph_analysis_per_op",
      "Tracks processing state per op in first phase of mlir bridge", "op_name",
      "construction_context", "is_single_core_inference_mode", "num_replicas",
      "num_cores_per_replica", "use_tpu", "allow_soft_placement",
      "use_spmd_for_xla_partitioning", "unsupported_reason",
      "has_unsupported_features");

  metric
      ->GetCell(op_name, construction_context,
                is_single_core_inference_mode ? "Yes" : "No", num_replicas,
                num_cores_per_replica, use_tpu, allow_soft_placement,
                use_spmd_for_xla_partitioning, unsupported_reason,
                has_unsupported_features ? "Yes" : "No")
      ->IncrementBy(1);
}

}  // namespace metrics
}  // namespace tensorflow