1 /* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/cc/saved_model/metrics.h"
17
18 #include <string>
19
20 #include "tensorflow/core/lib/monitoring/counter.h"
21 #include "tensorflow/core/lib/monitoring/sampler.h"
22
23 namespace tensorflow {
24 namespace metrics {
25
26 namespace {
27
28 // Counter that tracks total number and `write_version` of SavedModels written.
29 auto* saved_model_write_counter = monitoring::Counter<1>::New(
30 "/tensorflow/core/saved_model/write/count",
31 "The number of SavedModels successfully written.", "write_version");
32
33 // Counter that tracks total number and `write_version` of SavedModels read.
34 auto* saved_model_read_counter = monitoring::Counter<1>::New(
35 "/tensorflow/core/saved_model/read/count",
36 "The number of SavedModels successfully loaded.", "write_version");
37
38 // Counter that tracks number of calls for each SavedModel write API. Summing
39 // across "api_label" is not expected to equal the ".../write/count" cell value
40 // because programs can invoke more than one API to save a single SM and
41 // because the API may error out before successfully writing a SM.
42 auto* saved_model_write_api = monitoring::Counter<1>::New(
43 "/tensorflow/core/saved_model/write/api",
44 "The API used to write the SavedModel.", "api_label");
45
46 // Counter that tracks number of calls for each SavedModel read API. Summing
47 // across "api_label" is not expected to equal the ".../read/count" cell value
48 // because programs can invoke more than one API to load a single SM and
49 // because the API may error out before successfully reading a SM.
50 auto* saved_model_read_api = monitoring::Counter<1>::New(
51 "/tensorflow/core/saved_model/read/api",
52 "The API used to load the SavedModel.", "api_label");
53
54 // Distribution of checkpoint write durations.
55 auto* checkpoint_write_durations = monitoring::Sampler<1>::New(
56 {
57 "/tensorflow/core/checkpoint/write/write_durations", // Metric name.
58 "Distribution of the wall time duration in microseconds of the "
59 "checkpoint write operation.", // Metric description.
60 "api_label" // Cell label.
61 },
62 // Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes.
63 monitoring::Buckets::Exponential(1000, 1.5, 41));
64
65 // Distribution of checkpoint read durations.
66 auto* checkpoint_read_durations = monitoring::Sampler<1>::New(
67 {
68 "/tensorflow/core/checkpoint/read/read_durations", // Metric name.
69 "Distribution of the wall time duration in microseconds of the "
70 "checkpoint read operation.", // Metric description.
71 "api_label" // Cell label.
72 },
73 // Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes.
74 monitoring::Buckets::Exponential(1000, 1.5, 41));
75
76 // Counter that accumulates total time elapsed between module import time and
77 // the last successful Checkpoint write prior to job pre-emption or completion.
78 auto* checkpoint_training_time_saved = monitoring::Counter<1>::New(
79 "/tensorflow/core/checkpoint/write/training_time_saved",
80 "Total time in microseconds elapsed between two consecutive write "
81 "operations in a single job or between Checkpoint construction and the "
82 "first write operation.",
83 "api_label");
84
85 } // namespace
86
SavedModelWrite(absl::string_view write_version)87 monitoring::CounterCell& SavedModelWrite(absl::string_view write_version) {
88 return *saved_model_write_counter->GetCell(std::string(write_version));
89 }
90
SavedModelRead(absl::string_view write_version)91 monitoring::CounterCell& SavedModelRead(absl::string_view write_version) {
92 return *saved_model_read_counter->GetCell(std::string(write_version));
93 }
94
SavedModelWriteApi(absl::string_view api_label)95 monitoring::CounterCell& SavedModelWriteApi(absl::string_view api_label) {
96 return *saved_model_write_api->GetCell(std::string(api_label));
97 }
98
SavedModelReadApi(absl::string_view api_label)99 monitoring::CounterCell& SavedModelReadApi(absl::string_view api_label) {
100 return *saved_model_read_api->GetCell(std::string(api_label));
101 }
102
CheckpointReadDuration(absl::string_view api_label)103 monitoring::SamplerCell& CheckpointReadDuration(absl::string_view api_label) {
104 return *checkpoint_read_durations->GetCell(std::string(api_label));
105 }
106
CheckpointWriteDuration(absl::string_view api_label)107 monitoring::SamplerCell& CheckpointWriteDuration(absl::string_view api_label) {
108 return *checkpoint_write_durations->GetCell(std::string(api_label));
109 }
110
TrainingTimeSaved(absl::string_view api_label)111 monitoring::CounterCell& TrainingTimeSaved(absl::string_view api_label) {
112 return *checkpoint_training_time_saved->GetCell(std::string(api_label));
113 }
114
115 } // namespace metrics
116 } // namespace tensorflow
117