1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_
17 #define TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_
18
19 #include <map>
20 #include <memory>
21
22 #include "tensorflow/core/framework/summary.pb.h"
23 #include "tensorflow/core/lib/monitoring/collected_metrics.h"
24 #include "tensorflow/core/lib/monitoring/metric_def.h"
25 #include "tensorflow/core/lib/monitoring/types.h"
26 #include "tensorflow/core/platform/env.h"
27 #include "tensorflow/core/platform/logging.h"
28 #include "tensorflow/core/platform/macros.h"
29 #include "tensorflow/core/platform/mutex.h"
30 #include "tensorflow/core/platform/stringpiece.h"
31 #include "tensorflow/core/platform/thread_annotations.h"
32 #include "tensorflow/core/platform/types.h"
33
34 namespace tensorflow {
35 namespace monitoring {
36
37 namespace test_util {
38 class CollectionRegistryTestAccess;
39 } // namespace test_util
40
41 namespace internal {
42 class Collector;
43 } // namespace internal
44
45 // Metric implementations would get an instance of this class using the
46 // MetricCollectorGetter in the collection-function lambda, so that their values
47 // can be collected.
48 //
49 // Read the documentation on CollectionRegistry::Register() for more details.
50 //
51 // For example:
52 // auto metric_collector = metric_collector_getter->Get(&metric_def);
53 // metric_collector.CollectValue(some_labels, some_value);
54 // metric_collector.CollectValue(others_labels, other_value);
55 //
56 // This class is NOT thread-safe.
57 template <MetricKind metric_kind, typename Value, int NumLabels>
58 class MetricCollector {
59 public:
60 ~MetricCollector() = default;
61
62 // Collects the value with these labels.
63 void CollectValue(const std::array<string, NumLabels>& labels, Value value);
64
65 private:
66 friend class internal::Collector;
67
MetricCollector(const MetricDef<metric_kind,Value,NumLabels> * const metric_def,const uint64 registration_time_millis,internal::Collector * const collector,PointSet * const point_set)68 MetricCollector(
69 const MetricDef<metric_kind, Value, NumLabels>* const metric_def,
70 const uint64 registration_time_millis,
71 internal::Collector* const collector, PointSet* const point_set)
72 : metric_def_(metric_def),
73 registration_time_millis_(registration_time_millis),
74 collector_(collector),
75 point_set_(point_set) {
76 point_set_->metric_name = string(metric_def->name());
77 }
78
79 const MetricDef<metric_kind, Value, NumLabels>* const metric_def_;
80 const uint64 registration_time_millis_;
81 internal::Collector* const collector_;
82 PointSet* const point_set_;
83
84 // This is made copyable because we can't hand out references of this class
85 // from MetricCollectorGetter because this class is templatized, and we need
86 // MetricCollectorGetter not to be templatized and hence MetricCollectorGetter
87 // can't own an instance of this class.
88 };
89
90 // Returns a MetricCollector with the same template parameters as the
91 // metric-definition, so that the values of a metric can be collected.
92 //
93 // The collection-function defined by a metric takes this as a parameter.
94 //
95 // Read the documentation on CollectionRegistry::Register() for more details.
96 class MetricCollectorGetter {
97 public:
98 // Returns the MetricCollector with the same template parameters as the
99 // metric_def.
100 template <MetricKind metric_kind, typename Value, int NumLabels>
101 MetricCollector<metric_kind, Value, NumLabels> Get(
102 const MetricDef<metric_kind, Value, NumLabels>* const metric_def);
103
104 private:
105 friend class internal::Collector;
106
MetricCollectorGetter(internal::Collector * const collector,const AbstractMetricDef * const allowed_metric_def,const uint64 registration_time_millis)107 MetricCollectorGetter(internal::Collector* const collector,
108 const AbstractMetricDef* const allowed_metric_def,
109 const uint64 registration_time_millis)
110 : collector_(collector),
111 allowed_metric_def_(allowed_metric_def),
112 registration_time_millis_(registration_time_millis) {}
113
114 internal::Collector* const collector_;
115 const AbstractMetricDef* const allowed_metric_def_;
116 const uint64 registration_time_millis_;
117 };
118
119 // A collection registry for metrics.
120 //
121 // Metrics are registered here so that their state can be collected later and
122 // exported.
123 //
124 // This class is thread-safe.
125 class CollectionRegistry {
126 public:
127 ~CollectionRegistry() = default;
128
129 // Returns the default registry for the process.
130 //
131 // This registry belongs to this library and should never be deleted.
132 static CollectionRegistry* Default();
133
134 using CollectionFunction = std::function<void(MetricCollectorGetter getter)>;
135
136 // Registers the metric and the collection-function which can be used to
137 // collect its values. Returns a Registration object, which when upon
138 // destruction would cause the metric to be unregistered from this registry.
139 //
140 // IMPORTANT: Delete the handle before the metric-def is deleted.
141 //
142 // Example usage;
143 // CollectionRegistry::Default()->Register(
144 // &metric_def,
145 // [&](MetricCollectorGetter getter) {
146 // auto metric_collector = getter.Get(&metric_def);
147 // for (const auto& cell : cells) {
148 // metric_collector.CollectValue(cell.labels(), cell.value());
149 // }
150 // });
151 class RegistrationHandle;
152 std::unique_ptr<RegistrationHandle> Register(
153 const AbstractMetricDef* metric_def,
154 const CollectionFunction& collection_function)
155 TF_LOCKS_EXCLUDED(mu_) TF_MUST_USE_RESULT;
156
157 // Options for collecting metrics.
158 struct CollectMetricsOptions {
CollectMetricsOptionsCollectMetricsOptions159 CollectMetricsOptions() {}
160 bool collect_metric_descriptors = true;
161 };
162 // Goes through all the registered metrics, collects their definitions
163 // (optionally) and current values and returns them in a standard format.
164 std::unique_ptr<CollectedMetrics> CollectMetrics(
165 const CollectMetricsOptions& options) const;
166
167 private:
168 friend class test_util::CollectionRegistryTestAccess;
169 friend class internal::Collector;
170
171 CollectionRegistry(Env* env);
172
173 // Unregisters the metric from this registry. This is private because the
174 // public interface provides a Registration handle which automatically calls
175 // this upon destruction.
176 void Unregister(const AbstractMetricDef* metric_def) TF_LOCKS_EXCLUDED(mu_);
177
178 // TF environment, mainly used for timestamping.
179 Env* const env_;
180
181 mutable mutex mu_;
182
183 // Information required for collection.
184 struct CollectionInfo {
185 const AbstractMetricDef* const metric_def;
186 CollectionFunction collection_function;
187 uint64 registration_time_millis;
188 };
189 std::map<StringPiece, CollectionInfo> registry_ TF_GUARDED_BY(mu_);
190
191 TF_DISALLOW_COPY_AND_ASSIGN(CollectionRegistry);
192 };
193
194 ////
195 // Implementation details follow. API readers may skip.
196 ////
197
198 class CollectionRegistry::RegistrationHandle {
199 public:
RegistrationHandle(CollectionRegistry * const export_registry,const AbstractMetricDef * const metric_def)200 RegistrationHandle(CollectionRegistry* const export_registry,
201 const AbstractMetricDef* const metric_def)
202 : export_registry_(export_registry), metric_def_(metric_def) {}
203
~RegistrationHandle()204 ~RegistrationHandle() { export_registry_->Unregister(metric_def_); }
205
206 private:
207 CollectionRegistry* const export_registry_;
208 const AbstractMetricDef* const metric_def_;
209 };
210
211 namespace internal {
212
213 template <typename Value>
214 void CollectValue(Value value, Point* point);
215
216 template <>
CollectValue(int64 value,Point * const point)217 inline void CollectValue(int64 value, Point* const point) {
218 point->value_type = ValueType::kInt64;
219 point->int64_value = value;
220 }
221
222 template <>
CollectValue(string value,Point * const point)223 inline void CollectValue(string value, Point* const point) {
224 point->value_type = ValueType::kString;
225 point->string_value = std::move(value);
226 }
227
228 template <>
CollectValue(bool value,Point * const point)229 inline void CollectValue(bool value, Point* const point) {
230 point->value_type = ValueType::kBool;
231 point->bool_value = value;
232 }
233
234 template <>
CollectValue(HistogramProto value,Point * const point)235 inline void CollectValue(HistogramProto value, Point* const point) {
236 point->value_type = ValueType::kHistogram;
237 // This is inefficient. If and when we hit snags, we can change the API to do
238 // this more efficiently.
239 point->histogram_value = std::move(value);
240 }
241
242 template <>
CollectValue(Percentiles value,Point * const point)243 inline void CollectValue(Percentiles value, Point* const point) {
244 point->value_type = ValueType::kPercentiles;
245 point->percentiles_value = std::move(value);
246 }
247
248 // Used by the CollectionRegistry class to collect all the values of all the
249 // metrics in the registry. This is an implementation detail of the
250 // CollectionRegistry class, please do not depend on this.
251 //
252 // This cannot be a private nested class because we need to forward declare this
253 // so that the MetricCollector and MetricCollectorGetter classes can be friends
254 // with it.
255 //
256 // This class is thread-safe.
257 class Collector {
258 public:
Collector(const uint64 collection_time_millis)259 Collector(const uint64 collection_time_millis)
260 : collected_metrics_(new CollectedMetrics()),
261 collection_time_millis_(collection_time_millis) {}
262
263 template <MetricKind metric_kind, typename Value, int NumLabels>
GetMetricCollector(const MetricDef<metric_kind,Value,NumLabels> * const metric_def,const uint64 registration_time_millis,internal::Collector * const collector)264 MetricCollector<metric_kind, Value, NumLabels> GetMetricCollector(
265 const MetricDef<metric_kind, Value, NumLabels>* const metric_def,
266 const uint64 registration_time_millis,
267 internal::Collector* const collector) TF_LOCKS_EXCLUDED(mu_) {
268 auto* const point_set = [&]() {
269 mutex_lock l(mu_);
270 return collected_metrics_->point_set_map
271 .insert(std::make_pair(string(metric_def->name()),
272 std::unique_ptr<PointSet>(new PointSet())))
273 .first->second.get();
274 }();
275 return MetricCollector<metric_kind, Value, NumLabels>(
276 metric_def, registration_time_millis, collector, point_set);
277 }
278
collection_time_millis()279 uint64 collection_time_millis() const { return collection_time_millis_; }
280
281 void CollectMetricDescriptor(const AbstractMetricDef* const metric_def)
282 TF_LOCKS_EXCLUDED(mu_);
283
284 void CollectMetricValues(
285 const CollectionRegistry::CollectionInfo& collection_info);
286
287 std::unique_ptr<CollectedMetrics> ConsumeCollectedMetrics()
288 TF_LOCKS_EXCLUDED(mu_);
289
290 private:
291 mutable mutex mu_;
292 std::unique_ptr<CollectedMetrics> collected_metrics_ TF_GUARDED_BY(mu_);
293 const uint64 collection_time_millis_;
294
295 TF_DISALLOW_COPY_AND_ASSIGN(Collector);
296 };
297
298 // Write the timestamps for the point based on the MetricKind.
299 //
300 // Gauge metrics will have start and end timestamps set to the collection time.
301 //
302 // Cumulative metrics will have the start timestamp set to the time when the
303 // collection function was registered, while the end timestamp will be set to
304 // the collection time.
305 template <MetricKind kind>
306 void WriteTimestamps(const uint64 registration_time_millis,
307 const uint64 collection_time_millis, Point* const point);
308
309 template <>
310 inline void WriteTimestamps<MetricKind::kGauge>(
311 const uint64 registration_time_millis, const uint64 collection_time_millis,
312 Point* const point) {
313 point->start_timestamp_millis = collection_time_millis;
314 point->end_timestamp_millis = collection_time_millis;
315 }
316
317 template <>
318 inline void WriteTimestamps<MetricKind::kCumulative>(
319 const uint64 registration_time_millis, const uint64 collection_time_millis,
320 Point* const point) {
321 point->start_timestamp_millis = registration_time_millis;
322 // There's a chance that the clock goes backwards on the same machine, so we
323 // protect ourselves against that.
324 point->end_timestamp_millis =
325 registration_time_millis < collection_time_millis
326 ? collection_time_millis
327 : registration_time_millis;
328 }
329
330 } // namespace internal
331
332 template <MetricKind metric_kind, typename Value, int NumLabels>
CollectValue(const std::array<string,NumLabels> & labels,Value value)333 void MetricCollector<metric_kind, Value, NumLabels>::CollectValue(
334 const std::array<string, NumLabels>& labels, Value value) {
335 point_set_->points.emplace_back(new Point());
336 auto* const point = point_set_->points.back().get();
337 const std::vector<string> label_descriptions =
338 metric_def_->label_descriptions();
339 point->labels.reserve(NumLabels);
340 for (int i = 0; i < NumLabels; ++i) {
341 point->labels.push_back({});
342 auto* const label = &point->labels.back();
343 label->name = label_descriptions[i];
344 label->value = labels[i];
345 }
346 internal::CollectValue(std::move(value), point);
347 internal::WriteTimestamps<metric_kind>(
348 registration_time_millis_, collector_->collection_time_millis(), point);
349 }
350
351 template <MetricKind metric_kind, typename Value, int NumLabels>
Get(const MetricDef<metric_kind,Value,NumLabels> * const metric_def)352 MetricCollector<metric_kind, Value, NumLabels> MetricCollectorGetter::Get(
353 const MetricDef<metric_kind, Value, NumLabels>* const metric_def) {
354 if (allowed_metric_def_ != metric_def) {
355 LOG(FATAL) << "Expected collection for: " << allowed_metric_def_->name()
356 << " but instead got: " << metric_def->name();
357 }
358
359 return collector_->GetMetricCollector(metric_def, registration_time_millis_,
360 collector_);
361 }
362
// Interface implemented by metric exporters.
class Exporter {
 public:
  virtual ~Exporter() = default;

  // Called by exporter_registration::ExporterRegistration as soon as the
  // exporter is registered.
  virtual void PeriodicallyExportMetrics() = 0;

  // NOTE(review): presumably performs a single export -- confirm against the
  // concrete exporters.
  virtual void ExportMetrics() = 0;
};
369
370 namespace exporter_registration {
371
372 class ExporterRegistration {
373 public:
ExporterRegistration(Exporter * exporter)374 explicit ExporterRegistration(Exporter* exporter) : exporter_(exporter) {
375 exporter_->PeriodicallyExportMetrics();
376 }
377
378 private:
379 Exporter* exporter_;
380 };
381
382 } // namespace exporter_registration
383
// Defines a static ExporterRegistration named exporter_registration_<ctr>,
// constructed with a heap-allocated instance of `exporter`.
#define REGISTER_TF_METRICS_EXPORTER_UNIQ(ctr, exporter)                       \
  static ::tensorflow::monitoring::exporter_registration::ExporterRegistration \
      exporter_registration_##ctr(new exporter())

// Extra level of indirection so that `ctr` is macro-expanded before it is
// pasted into the variable name above.
#define REGISTER_TF_METRICS_EXPORTER_UNIQ_HELPER(ctr, exporter) \
  REGISTER_TF_METRICS_EXPORTER_UNIQ(ctr, exporter)

// Registers the exporter class `exporter`; __COUNTER__ supplies the suffix of
// the generated variable name.
#define REGISTER_TF_METRICS_EXPORTER(exporter) \
  REGISTER_TF_METRICS_EXPORTER_UNIQ_HELPER(__COUNTER__, exporter)
393
394 } // namespace monitoring
395 } // namespace tensorflow
396
397 #endif // TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_
398