1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_STEP_STATS_COLLECTOR_H_ 16 #define TENSORFLOW_CORE_COMMON_RUNTIME_STEP_STATS_COLLECTOR_H_ 17 18 #include <memory> 19 #include <unordered_map> 20 #include <vector> 21 #include "tensorflow/core/framework/step_stats.pb.h" 22 #include "tensorflow/core/framework/tensor_reference.h" 23 #include "tensorflow/core/lib/gtl/inlined_vector.h" 24 #include "tensorflow/core/platform/env.h" 25 #include "tensorflow/core/platform/mutex.h" 26 #include "tensorflow/core/platform/thread_annotations.h" 27 #include "tensorflow/core/platform/types.h" 28 29 namespace tensorflow { 30 31 class Allocator; 32 class AllocatorMemoryUsed; 33 class CostModelManager; 34 class Graph; 35 class Node; 36 class NodeExecStats; 37 class OpKernelContext; 38 class StepStats; 39 class StepStatsCollector; 40 class Tensor; 41 class TrackingAllocator; 42 43 // Statistics collection interface for individual node execution. 44 // 45 // See `NodeExecStatsWrapper` for a concrete implementation of this interface 46 // that interfaces with the `Session` layer. 47 class NodeExecStatsInterface { 48 public: ~NodeExecStatsInterface()49 virtual ~NodeExecStatsInterface() {} 50 51 // Called when the statistics collection for the node has finished. Once this 52 // method is called, the caller should not make assumptions about the validity 53 // of this object. 54 virtual void Done(const string& device) = 0; 55 56 // Called immediately after this node starts being processed by the executor. 57 virtual void RecordExecutorStarted() = 0; 58 59 // Called immediately before this node's `Compute()` or `ComputeAsync()` 60 // method is called. 61 virtual void RecordComputeStarted() = 0; 62 63 // Called immediately after this node's `Compute()` method returned (or, for 64 // asynchronous operations, the callback passed to its `ComputeAsync()` method 65 // was called). 66 virtual void RecordComputeEnded() = 0; 67 68 // Called immediately after this executor finishes processing this node. 69 virtual void RecordExecutorEnded() = 0; 70 71 // Returns `true` if this object should track memory allocations. 72 virtual bool TrackAllocations() const = 0; 73 74 // Records information about the memory allocated during the execution of this 75 // node. 76 // 77 // Takes ownership of any `TrackingAllocator` objects stored in `ctx`. 78 virtual void SetMemory(OpKernelContext* ctx) = 0; 79 80 // Records information about the tensor produced by this node at the given 81 // output slot. 82 virtual void SetOutput(int slot, const Tensor* tensor) = 0; 83 84 // Records information about the tensors that were accessed during the 85 // execution of this node. 86 virtual void SetReferencedTensors(const TensorReferenceVector& tensors) = 0; 87 88 // Records the absolute time in nanoseconds at which this node became 89 // runnable (i.e. was scheduled for execution). 90 virtual void SetScheduled(int64 nanos) = 0; 91 }; 92 93 // Wraps NodeExecStats and adds allocation to it. 94 class NodeExecStatsWrapper : public NodeExecStatsInterface { 95 public: 96 // Does not take ownership of `node` or `step_stats_collector`. 97 NodeExecStatsWrapper(const Node* node, 98 StepStatsCollector* step_stats_collector); 99 100 // Takes ownership of 'stats' but not `node` or `step_stats_collector`. 101 NodeExecStatsWrapper(std::unique_ptr<NodeExecStats> stats, const Node* node, 102 StepStatsCollector* step_stats_collector); 103 104 // Destructor calls Finalize() to release the TrackingAllocators. ~NodeExecStatsWrapper()105 ~NodeExecStatsWrapper() { Finalize(); } 106 107 void Done(const string& device) override; 108 void RecordExecutorStarted() override; 109 void RecordComputeStarted() override; 110 void RecordComputeEnded() override; 111 void RecordExecutorEnded() override; TrackAllocations()112 bool TrackAllocations() const override { return true; } 113 void SetMemory(OpKernelContext* ctx) override; 114 void SetOutput(int slot, const Tensor* tensor) override; 115 void SetReferencedTensors(const TensorReferenceVector& tensors) override; 116 void SetScheduled(int64 nanos) override; 117 118 private: 119 friend class StepStatsCollector; 120 stats()121 NodeExecStats* stats() { return stats_.get(); } 122 123 // Populates stats_ and releases TrackingAllocator. 124 void Finalize(); 125 126 // Does not take ownership of the `allocator`. 127 // Takes ownership of `tracking_allocator`. 128 void AddAllocation(Allocator* allocator, 129 TrackingAllocator* tracking_allocator); 130 131 gtl::InlinedVector<std::pair<AllocatorMemoryUsed*, TrackingAllocator*>, 2> 132 allocations_; 133 std::unique_ptr<NodeExecStats> stats_; 134 const Node* const node_; // Not owned. 135 StepStatsCollector* const step_stats_collector_; // Not owned. 136 }; 137 138 // Statistics collection interface for step execution. 139 // 140 // See `StepStatsCollector` for a concrete implementation of this interface 141 // that interfaces with the `Session` layer. 142 class StepStatsCollectorInterface { 143 public: ~StepStatsCollectorInterface()144 virtual ~StepStatsCollectorInterface() {} 145 146 // Creates an instance of `NodeExecStatsInterface` that should be used for 147 // collecting statistics about individual node execution. 148 virtual NodeExecStatsInterface* CreateNodeExecStats(const Node* node) = 0; 149 150 // Generates a string reporting the currently used memory based 151 // on ResourceExhausted OOM `err` message. 152 // `err` message needs to contain device name and allocator name, e.g.: 153 // "ResourceExhaustedError: OOM when allocating tensor ... 154 // on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc" 155 virtual string ReportAllocsOnResourceExhausted(const string& err) = 0; 156 }; 157 158 // StepStatsCollector manages the collection of a StepStats object. 159 // The StepStats object holds multiple DeviceStats. 160 // Each DeviceStats object holds multiple NodeExecStats. 161 class StepStatsCollector : public StepStatsCollectorInterface { 162 public: 163 // Does not take ownership of `step_stats`. 164 explicit StepStatsCollector(StepStats* step_stats); 165 166 // BuildCostModel builds or updates a CostModel managed by cost_model_manager, 167 // using the currently collected DeviceStats associated with the devices in 168 // device_map. 169 void BuildCostModel( 170 CostModelManager* cost_model_manager, 171 const std::unordered_map<string, const Graph*>& device_map); 172 173 // Saves node statistics to the DeviceStats object associated with device. 174 // Should be called before Finalize. 175 void Save(const string& device, NodeExecStats* node_stats_pb); 176 void Save(const string& device, NodeExecStatsWrapper* node_stats); 177 178 // Saves thread name. 179 void SaveThreadName(const string& device, const uint32 thread_id, 180 const string& thread_name); 181 182 NodeExecStatsInterface* CreateNodeExecStats(const Node* node) override; 183 string ReportAllocsOnResourceExhausted(const string& err) override; 184 185 // The following 2 Finalize methods populate the StepStats passed 186 // from the constructor. Calling it more than once won't have any effect. 187 // User shouldn't call Save() methods after Finalize. 188 void Finalize(); 189 // swaps the content of StepStats* from constructor with 'ss'. 190 void FinalizeAndSwap(StepStats* step_stats); 191 192 private: 193 // TODO(suharshs): Make this configurable if its not possible to find a value 194 // that works for all cases. 195 static const uint64 kMaxCollectedNodes = 1 << 20; 196 197 typedef std::vector<std::unique_ptr<NodeExecStatsWrapper>> NodeStatsVector; 198 typedef std::unordered_map<uint32, string> ThreadNamesMap; 199 200 void FinalizeInternal() EXCLUSIVE_LOCKS_REQUIRED(mu_); 201 202 mutex mu_; 203 bool finalized_ GUARDED_BY(mu_); 204 std::unordered_map<string, NodeStatsVector> dev_stats_ GUARDED_BY(mu_); 205 std::unordered_map<string, ThreadNamesMap> thread_names_ GUARDED_BY(mu_); 206 StepStats* step_stats_ GUARDED_BY(mu_); 207 uint64 collected_nodes_ GUARDED_BY(mu_) = 0; 208 }; 209 210 } // namespace tensorflow 211 212 #endif // TENSORFLOW_CORE_COMMON_RUNTIME_STEP_STATS_COLLECTOR_H_ 213