1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_CORE_FRAMEWORK_METRICS_H_ 17 #define TENSORFLOW_CORE_FRAMEWORK_METRICS_H_ 18 19 #include "absl/container/flat_hash_map.h" 20 #include "tensorflow/core/framework/dataset_options.pb.h" 21 #include "tensorflow/core/lib/monitoring/counter.h" 22 #include "tensorflow/core/lib/monitoring/gauge.h" 23 #include "tensorflow/core/platform/statusor.h" 24 #include "tensorflow/core/platform/types.h" 25 26 namespace tensorflow { 27 namespace metrics { 28 29 // Records that a tf.data.Dataset executed by the program used autotuning. 30 // 31 // The `name` argument identifies the Dataset type (e.g. "ParallelMap"). 32 void RecordTFDataAutotune(const string& name); 33 34 // Returns a counter that can be used to record the number of bytes produced by 35 // a tf.data.Dataset. 36 // 37 // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map"). 38 monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name); 39 40 // Returns a counter that can be used to record the number of bytes produced by 41 // a tf.data.Dataset. 42 // 43 // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map"). 44 monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name); 45 46 // Returns a counter than can be used to record the number of bytes read from 47 // the filesystem by a tf.data.Dataset source. 48 // 49 // The `name` argument identifies the Dataset type (e.g. "TFRecordDataset"). 50 // 51 // TODO(jsimsa): Remove this now that we have GetTFDataBytesConsumedCounter? 52 monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name); 53 54 // Returns a counter than can be used to record the number of elements produced 55 // by a tf.data.Dataset. 56 // 57 // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map"). 58 monitoring::CounterCell* GetTFDataElementsCounter(const string& name); 59 60 // Returns a gauge than can be used to record the performance model information. 61 // 62 // The `id` argument represents the (unique) model ID. 63 monitoring::GaugeCell<std::function<std::string()>>* GetTFDataModelGauge( 64 const string& id); 65 66 // Records the number of bytes fetched from tf.data.Dataset iterator. 67 void RecordTFDataBytesFetched(int64_t num_bytes); 68 69 // Records the number of times tf.data experiment is applied to input pipelines. 70 void RecordTFDataExperiment(const string& name); 71 72 // Records the time (in microseconds) spent in a single invocation of 73 // `ItertatorResource::GetNext()`. 74 void RecordTFDataGetNextDuration(uint64 duration_us); 75 76 // Records the number of times each tf.data fingerprint is used 77 // to measure duplicate pre-processing. 78 // 79 // The `name` argument identifies the Dataset graph fingerprint, 80 // created using GraphHash(). 81 void RecordTFDataFingerprint(const string& name); 82 83 // Records the time (in microseconds) during which `IteratorResource` was busy 84 // processing at least one `GetNext()` request. 85 void RecordTFDataIteratorBusy(uint64 duration_us); 86 87 // Records the time (in microseconds) between `IteratorResource` receiving the 88 // first `GetNext()` request and responding to the last `GetNext()` request. 89 void RecordTFDataIteratorLifetime(uint64 duration_us); 90 91 // Records the number of independent graph changes resulting from the 92 // application of a tf.data optimization. 93 // 94 // The `name` argument identifies the optimization (e.g. "noop_elimination"). 95 void RecordTFDataOptimization(const string& name, int64_t num_changes); 96 97 // Records that a tf.data service worker has been created. 98 void RecordTFDataServiceWorkerCreated(); 99 100 // Records the file name read by a tf.data Dataset. 101 // 102 // The `name` argument identifies the Dataset type (e.g. "TFRecordDataset"). 103 void RecordTFDataFilename(const string& name, const string& filename); 104 105 // Records statistics of tf.data auto sharding. 106 // 107 // The `id` is a unique identifier of the input pipeline. The `policy` 108 // identifies the auto-sharding policy used, the `num_workers` identifies the 109 // number of workers, and `num_replicas` identifies the number of replicas. 110 void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy, 111 int64 num_workers, int64 num_replicas); 112 113 // Records parsing of dense tensor features. 114 void RecordParseDenseFeature(int64_t num_features); 115 116 // Records parsing of sparse tensor features. 117 void RecordParseSparseFeature(int64_t num_features); 118 119 // Records parsing of ragged tensor features. 120 void RecordParseRaggedFeature(int64_t num_features); 121 122 // Records the size of input/output tensors in bytes. 123 void RecordGraphInputTensors(const size_t size); 124 void RecordGraphOutputTensors(const size_t size); 125 126 // Records the number of cores requested by graphs with XLA SPMD enabled. 127 void RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica); 128 129 void UpdateGraphExecTime(const uint64 running_time_usecs); 130 void UpdateGraphPendingQueueLength(uint64 len); 131 132 // Records that one output of an op of type `op_name` was unused. 133 void RecordUnusedOutput(const string& op_name); 134 135 // Updates the metrics stored about time spent building graphs. 136 // 137 // By "GraphBuild", we refer to building a client graph, which is a sub-graph of 138 // the full graph, induced by a set of options. In particular, these options 139 // include the feeds and fetches requested. 140 // 141 // This includes time spent: 142 // * optimizing the graphs with Grappler 143 // * pruning the sub-graph (unless the place_pruned_graph option is set) 144 // 145 // When executing eagerly, this will not record any activity. 146 // 147 // TODO(jtkeeling): Should we record building/optimizing tf.functions? 148 void UpdateGraphBuildTime(const uint64 running_time_usecs); 149 150 // Updates the metrics stored about graph optimizations. 151 void UpdateGraphOptimizationPassTime(const string& pass_name, 152 const uint64 running_time_usecs); 153 void UpdateGrapplerPassTime(const string& pass_name, 154 const uint64 running_time_usecs); 155 156 // Updates metrics for time to distribute variables to all TPU hosts. 157 void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs); 158 159 // Updates the metrics stored about time XLA spents compiling graphs. 160 void UpdateXlaCompilationTime(const uint64 compilation_time_usecs); 161 162 // Updates the metrics stored about time BFC allocator spents during delay. 163 void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs); 164 165 } // namespace metrics 166 } // namespace tensorflow 167 168 #endif // TENSORFLOW_CORE_FRAMEWORK_METRICS_H_ 169