1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_CORE_UTIL_DEBUG_EVENTS_WRITER_H_ 17 #define TENSORFLOW_CORE_UTIL_DEBUG_EVENTS_WRITER_H_ 18 19 #include <deque> 20 21 #include "absl/container/flat_hash_map.h" 22 #include "tensorflow/core/framework/tensor.h" 23 #include "tensorflow/core/lib/core/status.h" 24 #include "tensorflow/core/lib/io/record_writer.h" 25 #include "tensorflow/core/platform/env.h" 26 #include "tensorflow/core/platform/macros.h" 27 #include "tensorflow/core/platform/types.h" 28 #include "tensorflow/core/protobuf/debug_event.pb.h" 29 30 namespace tensorflow { 31 namespace tfdbg { 32 33 // The set of files generated by a debugged TensorFlow program. 34 enum DebugEventFileType { 35 METADATA, 36 SOURCE_FILES, 37 STACK_FRAMES, 38 GRAPHS, 39 EXECUTION, 40 GRAPH_EXECUTION_TRACES, 41 }; 42 43 // Helper class for DebugEventsWriter. 44 // This class manages the writing of data to a single TFRecord file. 45 // Each object of the DebugEventsWriter class below involves multiple 46 // TFRecord files, and hence utilizes multiple objects of this helper class. 47 class SingleDebugEventFileWriter { 48 public: 49 explicit SingleDebugEventFileWriter(const string& file_path); 50 51 Status Init(); 52 53 void WriteSerializedDebugEvent(tensorflow::StringPiece debug_event_str); 54 55 Status Flush(); 56 Status Close(); 57 58 const string FileName(); 59 60 private: 61 Env* env_; 62 const string file_path_; 63 std::atomic_int_fast32_t num_outstanding_events_; 64 65 std::unique_ptr<WritableFile> writable_file_; 66 std::unique_ptr<io::RecordWriter> record_writer_ TF_PT_GUARDED_BY(writer_mu_); 67 mutex writer_mu_; 68 }; 69 70 // The DebugEvents writer class. 71 class DebugEventsWriter { 72 public: 73 #ifndef SWIG 74 // Prefix of version string present in the first entry of every event file. 75 // Default size of each circular buffer (unit: number of DebugEvent protos). 76 static constexpr const int64 kDefaultCyclicBufferSize = 1000; 77 78 static constexpr const char* kFileNamePrefix = "tfdbg_events"; 79 static constexpr const char* kMetadataSuffix = "metadata"; 80 static constexpr const char* kSourceFilesSuffix = "source_files"; 81 static constexpr const char* kStackFramesSuffix = "stack_frames"; 82 static constexpr const char* kGraphsSuffix = "graphs"; 83 static constexpr const char* kExecutionSuffix = "execution"; 84 static constexpr const char* kGraphExecutionTracesSuffix = 85 "graph_execution_traces"; 86 87 static constexpr const char* kVersionPrefix = "debug.Event:"; 88 static constexpr const int kCurrentFormatVersion = 1; 89 #endif 90 91 // Get the DebugEventsWriter for the given dump_root. 92 // For a given dump_root value, it is a singleton. tfdbg event files come in 93 // sets of six. The singleton pattern avoids storing multiple sets in a single 94 // folder, which might cause confusion. 95 // 96 // If an instance of DebugEventsWriter has already been created at a 97 // `dump_root`, calling this method with the same `dump_root` will return 98 // the existing instance. 99 // 100 // Args: 101 // dump_root: Dump root directory. If it doesn't exist, will be created. 102 // tfdbg_run_id: Debugging run ID of the writer. 103 // circular_buffer_size: Circular buffer size (in number of DebugEvent 104 // protos). If set to a value <=0, will abolish the circular-buffer 105 // behavior. 106 // Returns: 107 // A pointer to a DebugEventsWriter object: a per-dump_root singleton. 108 static DebugEventsWriter* GetDebugEventsWriter(const string& dump_root, 109 const string& tfdbg_run_id, 110 int64 circular_buffer_size); 111 // Look up existing events writer by dump_root. 112 // If no DebugEventsWriter has been created at the dump_root, a non-OK 113 // Status will be returned. Else an OK status will be returned, with 114 // the pointer to the existing instance provided by reference. 115 static Status LookUpDebugEventsWriter( 116 const string& dump_root, DebugEventsWriter** debug_events_writer); 117 ~DebugEventsWriter(); 118 119 // Sets the debug event filenames and opens file for writing. 120 // All files (see the DebugEventFileType enum) share the same prefix and 121 // differ only in their suffixes. If not called by user, will be invoked 122 // automatically by a call to FileName() or any of the Write*() methods(). 123 // Idempotent: if the metadata file exists and is open, this is a no-op. 124 // If on the other hand the file was opened, but has since disappeared (e.g. 125 // deleted by another process), this will open a new file. 126 Status Init(); 127 128 // The four DebugEvent fields below are written _without_ the circular 129 // buffer. Source file contents are written to the *.source_files file. 130 // Takes ownership of source_file. 131 Status WriteSourceFile(SourceFile* source_file); 132 // Stack frames are written to the *.code_locations file. 133 // Takes ownership of stack_frame_with_id. 134 Status WriteStackFrameWithId(StackFrameWithId* stack_frame_with_id); 135 // Graph op creation events are written to the *.graphs file. 136 // Takes ownership of graph_op_creation. 137 Status WriteGraphOpCreation(GraphOpCreation* graph_op_creation); 138 // Debugged graphs are written to the *.graphs file. 139 // Takes ownership of debugged_graph. 140 Status WriteDebuggedGraph(DebuggedGraph* debugged_graph); 141 142 // The two DebugEvent fields below are written to the circular buffer 143 // and saved to disk only at the FlushExecutionFiles() call. 144 // Execution events (eager execution of an op or a tf.function) are written 145 // to the *.execution file. Takes ownership of execution. 146 Status WriteExecution(Execution* execution); 147 // Graph execution traces (graph-internal tensor values or their summaries) 148 // are written to the *.graph_execution_traces file. 149 // Takes ownership of graph_execution_trace. 150 Status WriteGraphExecutionTrace(GraphExecutionTrace* graph_execution_trace); 151 152 // Write a graph execution trace without using a protocol buffer. 153 // Instead, pass the raw values related to the graph execution trace. 154 // Args: 155 // tfdbg_context_id: A unique ID for the context of interest, e.g., a 156 // concreted compiled tf.function that the op of interest belongs to. 157 // op_name: Name of the op that this graph execution trace is concerned 158 // with. Applicable only to the single-tensor trace case. For cases in 159 // which the trace concerns multiple tensors, this is an empty string. 160 // output_slot: Output slot index of the op that this trace is concerned 161 // with. 162 // tensor_debug_mode: An integer that represents the tensor-debug mode 163 // enum. tensor_value: The value of the tensor that describes the 164 // tensor(s) 165 // that this trace is concerned with. The semantics of this tensor value 166 // depends on the value of `tensor_debug_mode`. 167 Status WriteGraphExecutionTrace(const string& tfdbg_context_id, 168 const string& device_name, 169 const string& op_name, int32 output_slot, 170 int32 tensor_debug_mode, 171 const Tensor& tensor_value); 172 173 // Writes a serialized DebugEvent to one of the debug-events files 174 // concerned with the non-execution events: the SOURCE_FILES, STACK_FRAMES 175 // and GRAPHS files. 176 // NOTE: Actually used in the Python binding, to avoid overhead of 177 // serializing and parsing protos at the language interface. 178 void WriteSerializedNonExecutionDebugEvent(const string& debug_event_str, 179 DebugEventFileType type); 180 181 // Writes a serialized DebugEvent to one of the debug-events files 182 // concerned with the execution-related events: the EXECUTION and 183 // GRAPH_EXECUTION_TRACES files. This involves the cyclic-buffer behavior if 184 // circular_buffer_size is configured to be >0. 185 // NOTE: Actually used in the Python binding, to avoid overhead of 186 // serializing and parsing protos at the language interface. 187 void WriteSerializedExecutionDebugEvent(const string& debug_event_str, 188 DebugEventFileType type); 189 190 // Given name of the device, retrieve a unique integer ID. As a side effect, 191 // if this is the first time this object encounters the device name, 192 // writes a DebuggedDevice proto to the .graphs file in the file set. 193 int RegisterDeviceAndGetId(const string& device_name); 194 195 // EventWriter automatically flushes and closes on destruction, but 196 // this method is provided for users who want to write to disk sooner 197 // and/or check for success. 198 // FlushNonExecutionFiles() pushes outstanding DebugEvents not written 199 // events to the circular buffer to their respective files. 200 Status FlushNonExecutionFiles(); 201 202 // Writes current contents of the circular buffers to their respective 203 // debug event files and clears the circular buffers. 204 Status FlushExecutionFiles(); 205 206 // Close() calls FlushNonExecutionFiles() and FlushExecutionFiles() 207 // and then closes the current debug events files. 208 Status Close(); 209 210 private: 211 static std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>* 212 213 // Get a static map from dump-root path to DebugEventsWriter objects. 214 // This helps the per-dump-root singletone pattern. 215 GetDebugEventsWriterMap(); 216 217 // Guards calls to the GetDebugEventsWriter() method. 218 static mutex factory_mu_; 219 220 DebugEventsWriter(const string& dump_root, const string& tfdbg_run_id, 221 int64 circular_buffer_size); 222 223 // Get the path prefix. The same for all files, which differ only in the 224 // suffix. 225 string FileName(DebugEventFileType type); 226 227 // Initialize the TFRecord writer for non-metadata file type. 228 Status InitNonMetadataFile(DebugEventFileType type); 229 230 Status SerializeAndWriteDebugEvent(DebugEvent* debug_event, 231 DebugEventFileType type); 232 233 void SelectWriter(DebugEventFileType type, 234 std::unique_ptr<SingleDebugEventFileWriter>** writer); 235 const string GetSuffix(DebugEventFileType type); 236 string GetFileNameInternal(DebugEventFileType type); 237 238 Env* env_; 239 const string dump_root_; 240 const string tfdbg_run_id_; 241 242 string file_prefix_; 243 bool is_initialized_ TF_GUARDED_BY(initialization_mu_); 244 mutex initialization_mu_; 245 246 const int64 circular_buffer_size_; 247 std::deque<string> execution_buffer_ TF_GUARDED_BY(execution_buffer_mu_); 248 mutex execution_buffer_mu_; 249 std::deque<string> graph_execution_trace_buffer_ 250 TF_GUARDED_BY(graph_execution_trace_buffer_mu_); 251 mutex graph_execution_trace_buffer_mu_; 252 253 absl::flat_hash_map<string, int> device_name_to_id_ TF_GUARDED_BY(device_mu_); 254 mutex device_mu_; 255 256 std::unique_ptr<SingleDebugEventFileWriter> metadata_writer_; 257 std::unique_ptr<SingleDebugEventFileWriter> source_files_writer_; 258 std::unique_ptr<SingleDebugEventFileWriter> stack_frames_writer_; 259 std::unique_ptr<SingleDebugEventFileWriter> graphs_writer_; 260 std::unique_ptr<SingleDebugEventFileWriter> execution_writer_; 261 std::unique_ptr<SingleDebugEventFileWriter> graph_execution_traces_writer_; 262 263 TF_DISALLOW_COPY_AND_ASSIGN(DebugEventsWriter); 264 265 friend class DebugEventsWriterTest; 266 }; 267 268 } // namespace tfdbg 269 } // namespace tensorflow 270 271 #endif // TENSORFLOW_CORE_UTIL_DEBUG_EVENTS_WRITER_H_ 272