1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/util/debug_events_writer.h"
17
18 #include "tensorflow/core/lib/io/path.h"
19 #include "tensorflow/core/lib/strings/strcat.h"
20 #include "tensorflow/core/lib/strings/stringprintf.h"
21 #include "tensorflow/core/platform/host_info.h"
22 #include "tensorflow/core/public/version.h"
23
24 namespace tensorflow {
25 namespace tfdbg {
26
27 namespace {
MaybeSetDebugEventTimestamp(DebugEvent * debug_event,Env * env)28 void MaybeSetDebugEventTimestamp(DebugEvent* debug_event, Env* env) {
29 if (debug_event->wall_time() == 0) {
30 debug_event->set_wall_time(env->NowMicros() / 1e6);
31 }
32 }
33 } // namespace
34
SingleDebugEventFileWriter(const string & file_path)35 SingleDebugEventFileWriter::SingleDebugEventFileWriter(const string& file_path)
36 : env_(Env::Default()),
37 file_path_(file_path),
38 num_outstanding_events_(0),
39 writer_mu_() {}
40
Init()41 Status SingleDebugEventFileWriter::Init() {
42 if (record_writer_ != nullptr) {
43 // TODO(cais): We currently don't check for file deletion. When the need
44 // arises, check and fix it.
45 return Status::OK();
46 }
47
48 // Reset recordio_writer (which has a reference to writable_file_) so final
49 // Flush() and Close() call have access to writable_file_.
50 record_writer_.reset();
51
52 TF_RETURN_WITH_CONTEXT_IF_ERROR(
53 env_->NewWritableFile(file_path_, &writable_file_),
54 "Creating writable file ", file_path_);
55 record_writer_.reset(new io::RecordWriter(writable_file_.get()));
56 if (record_writer_ == nullptr) {
57 return errors::Unknown("Could not create record writer at path: ",
58 file_path_);
59 }
60 num_outstanding_events_.store(0);
61 VLOG(1) << "Successfully opened debug events file: " << file_path_;
62 return Status::OK();
63 }
64
WriteSerializedDebugEvent(StringPiece debug_event_str)65 void SingleDebugEventFileWriter::WriteSerializedDebugEvent(
66 StringPiece debug_event_str) {
67 if (record_writer_ == nullptr) {
68 if (!Init().ok()) {
69 LOG(ERROR) << "Write failed because file could not be opened.";
70 return;
71 }
72 }
73 num_outstanding_events_.fetch_add(1);
74 {
75 mutex_lock l(writer_mu_);
76 record_writer_->WriteRecord(debug_event_str).IgnoreError();
77 }
78 }
79
Flush()80 Status SingleDebugEventFileWriter::Flush() {
81 const int num_outstanding = num_outstanding_events_.load();
82 if (num_outstanding == 0) {
83 return Status::OK();
84 }
85 if (writable_file_ == nullptr) {
86 return errors::Unknown("Unexpected NULL file for path: ", file_path_);
87 }
88
89 {
90 mutex_lock l(writer_mu_);
91 TF_RETURN_WITH_CONTEXT_IF_ERROR(record_writer_->Flush(), "Failed to flush ",
92 num_outstanding, " debug events to ",
93 file_path_);
94 }
95
96 TF_RETURN_WITH_CONTEXT_IF_ERROR(writable_file_->Sync(), "Failed to sync ",
97 num_outstanding, " debug events to ",
98 file_path_);
99 num_outstanding_events_.store(0);
100 return Status::OK();
101 }
102
Close()103 Status SingleDebugEventFileWriter::Close() {
104 Status status = Flush();
105 if (writable_file_ != nullptr) {
106 Status close_status = writable_file_->Close();
107 if (!close_status.ok()) {
108 status = close_status;
109 }
110 record_writer_.reset(nullptr);
111 writable_file_.reset(nullptr);
112 }
113 num_outstanding_events_ = 0;
114 return status;
115 }
116
FileName()117 const string SingleDebugEventFileWriter::FileName() { return file_path_; }
118
119 mutex DebugEventsWriter::factory_mu_(LINKER_INITIALIZED);
120
~DebugEventsWriter()121 DebugEventsWriter::~DebugEventsWriter() { Close().IgnoreError(); }
122
123 // static
GetDebugEventsWriter(const string & dump_root,const string & tfdbg_run_id,int64 circular_buffer_size)124 DebugEventsWriter* DebugEventsWriter::GetDebugEventsWriter(
125 const string& dump_root, const string& tfdbg_run_id,
126 int64 circular_buffer_size) {
127 mutex_lock l(DebugEventsWriter::factory_mu_);
128 std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>* writer_pool =
129 DebugEventsWriter::GetDebugEventsWriterMap();
130 if (writer_pool->find(dump_root) == writer_pool->end()) {
131 std::unique_ptr<DebugEventsWriter> writer(
132 new DebugEventsWriter(dump_root, tfdbg_run_id, circular_buffer_size));
133 writer_pool->insert(std::make_pair(dump_root, std::move(writer)));
134 }
135 return (*writer_pool)[dump_root].get();
136 }
137
138 // static
LookUpDebugEventsWriter(const string & dump_root,DebugEventsWriter ** debug_events_writer)139 Status DebugEventsWriter::LookUpDebugEventsWriter(
140 const string& dump_root, DebugEventsWriter** debug_events_writer) {
141 mutex_lock l(DebugEventsWriter::factory_mu_);
142 std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>* writer_pool =
143 DebugEventsWriter::GetDebugEventsWriterMap();
144 if (writer_pool->find(dump_root) == writer_pool->end()) {
145 return errors::FailedPrecondition(
146 "No DebugEventsWriter has been created at dump root ", dump_root);
147 }
148 *debug_events_writer = (*writer_pool)[dump_root].get();
149 return Status::OK();
150 }
151
Init()152 Status DebugEventsWriter::Init() {
153 mutex_lock l(initialization_mu_);
154
155 // TODO(cais): We currently don't check for file deletion. When the need
156 // arises, check and fix file deletion.
157 if (is_initialized_) {
158 return Status::OK();
159 }
160
161 if (!env_->IsDirectory(dump_root_).ok()) {
162 TF_RETURN_WITH_CONTEXT_IF_ERROR(env_->RecursivelyCreateDir(dump_root_),
163 "Failed to create directory ", dump_root_);
164 }
165
166 int64 time_in_seconds = env_->NowMicros() / 1e6;
167 file_prefix_ = io::JoinPath(
168 dump_root_, strings::Printf("%s.%010lld.%s", kFileNamePrefix,
169 static_cast<long long>(time_in_seconds),
170 port::Hostname().c_str()));
171 TF_RETURN_IF_ERROR(InitNonMetadataFile(SOURCE_FILES));
172 TF_RETURN_IF_ERROR(InitNonMetadataFile(STACK_FRAMES));
173 TF_RETURN_IF_ERROR(InitNonMetadataFile(GRAPHS));
174
175 // In case there is one left over from before.
176 metadata_writer_.reset();
177
178 // The metadata file should be created.
179 string metadata_filename = GetFileNameInternal(METADATA);
180 metadata_writer_.reset(new SingleDebugEventFileWriter(metadata_filename));
181 if (metadata_writer_ == nullptr) {
182 return errors::Unknown("Could not create debug event metadata file writer");
183 }
184
185 DebugEvent debug_event;
186 DebugMetadata* metadata = debug_event.mutable_debug_metadata();
187 metadata->set_tensorflow_version(TF_VERSION_STRING);
188 metadata->set_file_version(
189 strings::Printf("%s%d", kVersionPrefix, kCurrentFormatVersion));
190 metadata->set_tfdbg_run_id(tfdbg_run_id_);
191 TF_RETURN_IF_ERROR(SerializeAndWriteDebugEvent(&debug_event, METADATA));
192 TF_RETURN_WITH_CONTEXT_IF_ERROR(
193 metadata_writer_->Flush(), "Failed to flush debug event metadata writer");
194
195 TF_RETURN_IF_ERROR(InitNonMetadataFile(EXECUTION));
196 TF_RETURN_IF_ERROR(InitNonMetadataFile(GRAPH_EXECUTION_TRACES));
197 is_initialized_ = true;
198 return Status::OK();
199 }
200
WriteSourceFile(SourceFile * source_file)201 Status DebugEventsWriter::WriteSourceFile(SourceFile* source_file) {
202 DebugEvent debug_event;
203 debug_event.set_allocated_source_file(source_file);
204 return SerializeAndWriteDebugEvent(&debug_event, SOURCE_FILES);
205 }
206
WriteStackFrameWithId(StackFrameWithId * stack_frame_with_id)207 Status DebugEventsWriter::WriteStackFrameWithId(
208 StackFrameWithId* stack_frame_with_id) {
209 DebugEvent debug_event;
210 debug_event.set_allocated_stack_frame_with_id(stack_frame_with_id);
211 return SerializeAndWriteDebugEvent(&debug_event, STACK_FRAMES);
212 }
213
WriteGraphOpCreation(GraphOpCreation * graph_op_creation)214 Status DebugEventsWriter::WriteGraphOpCreation(
215 GraphOpCreation* graph_op_creation) {
216 DebugEvent debug_event;
217 debug_event.set_allocated_graph_op_creation(graph_op_creation);
218 return SerializeAndWriteDebugEvent(&debug_event, GRAPHS);
219 }
220
WriteDebuggedGraph(DebuggedGraph * debugged_graph)221 Status DebugEventsWriter::WriteDebuggedGraph(DebuggedGraph* debugged_graph) {
222 DebugEvent debug_event;
223 debug_event.set_allocated_debugged_graph(debugged_graph);
224 return SerializeAndWriteDebugEvent(&debug_event, GRAPHS);
225 }
226
WriteExecution(Execution * execution)227 Status DebugEventsWriter::WriteExecution(Execution* execution) {
228 if (circular_buffer_size_ <= 0) {
229 // No cyclic-buffer behavior.
230 DebugEvent debug_event;
231 debug_event.set_allocated_execution(execution);
232 return SerializeAndWriteDebugEvent(&debug_event, EXECUTION);
233 } else {
234 // Circular buffer behavior.
235 DebugEvent debug_event;
236 MaybeSetDebugEventTimestamp(&debug_event, env_);
237 debug_event.set_allocated_execution(execution);
238 string serialized;
239 debug_event.SerializeToString(&serialized);
240
241 mutex_lock l(execution_buffer_mu_);
242 execution_buffer_.emplace_back(std::move(serialized));
243 if (execution_buffer_.size() > circular_buffer_size_) {
244 execution_buffer_.pop_front();
245 }
246 return Status::OK();
247 }
248 }
249
WriteGraphExecutionTrace(GraphExecutionTrace * graph_execution_trace)250 Status DebugEventsWriter::WriteGraphExecutionTrace(
251 GraphExecutionTrace* graph_execution_trace) {
252 TF_RETURN_IF_ERROR(Init());
253 if (circular_buffer_size_ <= 0) {
254 // No cyclic-buffer behavior.
255 DebugEvent debug_event;
256 debug_event.set_allocated_graph_execution_trace(graph_execution_trace);
257 return SerializeAndWriteDebugEvent(&debug_event, GRAPH_EXECUTION_TRACES);
258 } else {
259 // Circular buffer behavior.
260 DebugEvent debug_event;
261 MaybeSetDebugEventTimestamp(&debug_event, env_);
262 debug_event.set_allocated_graph_execution_trace(graph_execution_trace);
263 string serialized;
264 debug_event.SerializeToString(&serialized);
265
266 mutex_lock l(graph_execution_trace_buffer_mu_);
267 graph_execution_trace_buffer_.emplace_back(std::move(serialized));
268 if (graph_execution_trace_buffer_.size() > circular_buffer_size_) {
269 graph_execution_trace_buffer_.pop_front();
270 }
271 return Status::OK();
272 }
273 }
274
WriteGraphExecutionTrace(const string & tfdbg_context_id,const string & device_name,const string & op_name,int32 output_slot,int32 tensor_debug_mode,const Tensor & tensor_value)275 Status DebugEventsWriter::WriteGraphExecutionTrace(
276 const string& tfdbg_context_id, const string& device_name,
277 const string& op_name, int32 output_slot, int32 tensor_debug_mode,
278 const Tensor& tensor_value) {
279 std::unique_ptr<GraphExecutionTrace> trace(new GraphExecutionTrace());
280 trace->set_tfdbg_context_id(tfdbg_context_id);
281 if (!op_name.empty()) {
282 trace->set_op_name(op_name);
283 }
284 if (output_slot > 0) {
285 trace->set_output_slot(output_slot);
286 }
287 if (tensor_debug_mode > 0) {
288 trace->set_tensor_debug_mode(TensorDebugMode(tensor_debug_mode));
289 }
290 trace->set_device_name(device_name);
291 tensor_value.AsProtoTensorContent(trace->mutable_tensor_proto());
292 return WriteGraphExecutionTrace(trace.release());
293 }
294
WriteSerializedNonExecutionDebugEvent(const string & debug_event_str,DebugEventFileType type)295 void DebugEventsWriter::WriteSerializedNonExecutionDebugEvent(
296 const string& debug_event_str, DebugEventFileType type) {
297 std::unique_ptr<SingleDebugEventFileWriter>* writer = nullptr;
298 SelectWriter(type, &writer);
299 (*writer)->WriteSerializedDebugEvent(debug_event_str);
300 }
301
WriteSerializedExecutionDebugEvent(const string & debug_event_str,DebugEventFileType type)302 void DebugEventsWriter::WriteSerializedExecutionDebugEvent(
303 const string& debug_event_str, DebugEventFileType type) {
304 const std::unique_ptr<SingleDebugEventFileWriter>* writer = nullptr;
305 std::deque<string>* buffer = nullptr;
306 mutex* mu = nullptr;
307 switch (type) {
308 case EXECUTION:
309 writer = &execution_writer_;
310 buffer = &execution_buffer_;
311 mu = &execution_buffer_mu_;
312 break;
313 case GRAPH_EXECUTION_TRACES:
314 writer = &graph_execution_traces_writer_;
315 buffer = &graph_execution_trace_buffer_;
316 mu = &graph_execution_trace_buffer_mu_;
317 break;
318 default:
319 return;
320 }
321
322 if (circular_buffer_size_ <= 0) {
323 // No cyclic-buffer behavior.
324 (*writer)->WriteSerializedDebugEvent(debug_event_str);
325 } else {
326 // Circular buffer behavior.
327 mutex_lock l(*mu);
328 buffer->push_back(debug_event_str);
329 if (buffer->size() > circular_buffer_size_) {
330 buffer->pop_front();
331 }
332 }
333 }
334
RegisterDeviceAndGetId(const string & device_name)335 int DebugEventsWriter::RegisterDeviceAndGetId(const string& device_name) {
336 mutex_lock l(device_mu_);
337 int& device_id = device_name_to_id_[device_name];
338 if (device_id == 0) {
339 device_id = device_name_to_id_.size();
340 DebugEvent debug_event;
341 MaybeSetDebugEventTimestamp(&debug_event, env_);
342 DebuggedDevice* debugged_device = debug_event.mutable_debugged_device();
343 debugged_device->set_device_name(device_name);
344 debugged_device->set_device_id(device_id);
345 string serialized;
346 debug_event.SerializeToString(&serialized);
347 graphs_writer_->WriteSerializedDebugEvent(serialized);
348 }
349 return device_id;
350 }
351
FlushNonExecutionFiles()352 Status DebugEventsWriter::FlushNonExecutionFiles() {
353 TF_RETURN_IF_ERROR(Init());
354 if (source_files_writer_ != nullptr) {
355 TF_RETURN_IF_ERROR(source_files_writer_->Flush());
356 }
357 if (stack_frames_writer_ != nullptr) {
358 TF_RETURN_IF_ERROR(stack_frames_writer_->Flush());
359 }
360 if (graphs_writer_ != nullptr) {
361 TF_RETURN_IF_ERROR(graphs_writer_->Flush());
362 }
363 return Status::OK();
364 }
365
FlushExecutionFiles()366 Status DebugEventsWriter::FlushExecutionFiles() {
367 TF_RETURN_IF_ERROR(Init());
368
369 if (execution_writer_ != nullptr) {
370 if (circular_buffer_size_ > 0) {
371 // Write out all the content in the circular buffers.
372 mutex_lock l(execution_buffer_mu_);
373 while (!execution_buffer_.empty()) {
374 execution_writer_->WriteSerializedDebugEvent(execution_buffer_.front());
375 // SerializeAndWriteDebugEvent(&execution_buffer_.front());
376 execution_buffer_.pop_front();
377 }
378 }
379 TF_RETURN_IF_ERROR(execution_writer_->Flush());
380 }
381
382 if (graph_execution_traces_writer_ != nullptr) {
383 if (circular_buffer_size_ > 0) {
384 // Write out all the content in the circular buffers.
385 mutex_lock l(graph_execution_trace_buffer_mu_);
386 while (!graph_execution_trace_buffer_.empty()) {
387 graph_execution_traces_writer_->WriteSerializedDebugEvent(
388 graph_execution_trace_buffer_.front());
389 graph_execution_trace_buffer_.pop_front();
390 }
391 }
392 TF_RETURN_IF_ERROR(graph_execution_traces_writer_->Flush());
393 }
394
395 return Status::OK();
396 }
397
FileName(DebugEventFileType type)398 string DebugEventsWriter::FileName(DebugEventFileType type) {
399 if (file_prefix_.empty()) {
400 Init().IgnoreError();
401 }
402 return GetFileNameInternal(type);
403 }
404
Close()405 Status DebugEventsWriter::Close() {
406 {
407 mutex_lock l(initialization_mu_);
408 if (!is_initialized_) {
409 return Status::OK();
410 }
411 }
412
413 std::vector<string> failed_to_close_files;
414
415 if (metadata_writer_ != nullptr) {
416 if (!metadata_writer_->Close().ok()) {
417 failed_to_close_files.push_back(metadata_writer_->FileName());
418 }
419 metadata_writer_.reset(nullptr);
420 }
421
422 TF_RETURN_IF_ERROR(FlushNonExecutionFiles());
423 if (source_files_writer_ != nullptr) {
424 if (!source_files_writer_->Close().ok()) {
425 failed_to_close_files.push_back(source_files_writer_->FileName());
426 }
427 source_files_writer_.reset(nullptr);
428 }
429 if (stack_frames_writer_ != nullptr) {
430 if (!stack_frames_writer_->Close().ok()) {
431 failed_to_close_files.push_back(stack_frames_writer_->FileName());
432 }
433 stack_frames_writer_.reset(nullptr);
434 }
435 if (graphs_writer_ != nullptr) {
436 if (!graphs_writer_->Close().ok()) {
437 failed_to_close_files.push_back(graphs_writer_->FileName());
438 }
439 graphs_writer_.reset(nullptr);
440 }
441
442 TF_RETURN_IF_ERROR(FlushExecutionFiles());
443 if (execution_writer_ != nullptr) {
444 if (!execution_writer_->Close().ok()) {
445 failed_to_close_files.push_back(execution_writer_->FileName());
446 }
447 execution_writer_.reset(nullptr);
448 }
449 if (graph_execution_traces_writer_ != nullptr) {
450 if (!graph_execution_traces_writer_->Close().ok()) {
451 failed_to_close_files.push_back(
452 graph_execution_traces_writer_->FileName());
453 }
454 graph_execution_traces_writer_.reset(nullptr);
455 }
456
457 if (failed_to_close_files.empty()) {
458 return Status::OK();
459 } else {
460 return errors::FailedPrecondition(
461 "Failed to close %d debug-events files associated with tfdbg",
462 failed_to_close_files.size());
463 }
464 }
465
466 // static
467 std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>*
GetDebugEventsWriterMap()468 DebugEventsWriter::GetDebugEventsWriterMap() {
469 static std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>*
470 writer_pool =
471 new std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>();
472 return writer_pool;
473 }
474
DebugEventsWriter(const string & dump_root,const string & tfdbg_run_id,int64 circular_buffer_size)475 DebugEventsWriter::DebugEventsWriter(const string& dump_root,
476 const string& tfdbg_run_id,
477 int64 circular_buffer_size)
478 : env_(Env::Default()),
479 dump_root_(dump_root),
480 tfdbg_run_id_(tfdbg_run_id),
481 is_initialized_(false),
482 initialization_mu_(),
483 circular_buffer_size_(circular_buffer_size),
484 execution_buffer_(),
485 execution_buffer_mu_(),
486 graph_execution_trace_buffer_(),
487 graph_execution_trace_buffer_mu_(),
488 device_name_to_id_(),
489 device_mu_() {}
490
InitNonMetadataFile(DebugEventFileType type)491 Status DebugEventsWriter::InitNonMetadataFile(DebugEventFileType type) {
492 std::unique_ptr<SingleDebugEventFileWriter>* writer = nullptr;
493 SelectWriter(type, &writer);
494 const string filename = GetFileNameInternal(type);
495 writer->reset();
496
497 writer->reset(new SingleDebugEventFileWriter(filename));
498 if (*writer == nullptr) {
499 return errors::Unknown("Could not create debug event file writer for ",
500 filename);
501 }
502 TF_RETURN_WITH_CONTEXT_IF_ERROR(
503 (*writer)->Init(), "Initializing debug event writer at path ", filename);
504 VLOG(1) << "Successfully opened debug event file: " << filename;
505
506 return Status::OK();
507 }
508
SerializeAndWriteDebugEvent(DebugEvent * debug_event,DebugEventFileType type)509 Status DebugEventsWriter::SerializeAndWriteDebugEvent(DebugEvent* debug_event,
510 DebugEventFileType type) {
511 std::unique_ptr<SingleDebugEventFileWriter>* writer = nullptr;
512 SelectWriter(type, &writer);
513 if (writer != nullptr) {
514 // Timestamp is in seconds, with double precision.
515 MaybeSetDebugEventTimestamp(debug_event, env_);
516 string str;
517 debug_event->AppendToString(&str);
518 (*writer)->WriteSerializedDebugEvent(str);
519 return Status::OK();
520 } else {
521 return errors::Internal(
522 "Unable to find debug events file writer for DebugEventsFileType ",
523 type);
524 }
525 }
526
SelectWriter(DebugEventFileType type,std::unique_ptr<SingleDebugEventFileWriter> ** writer)527 void DebugEventsWriter::SelectWriter(
528 DebugEventFileType type,
529 std::unique_ptr<SingleDebugEventFileWriter>** writer) {
530 switch (type) {
531 case METADATA:
532 *writer = &metadata_writer_;
533 break;
534 case SOURCE_FILES:
535 *writer = &source_files_writer_;
536 break;
537 case STACK_FRAMES:
538 *writer = &stack_frames_writer_;
539 break;
540 case GRAPHS:
541 *writer = &graphs_writer_;
542 break;
543 case EXECUTION:
544 *writer = &execution_writer_;
545 break;
546 case GRAPH_EXECUTION_TRACES:
547 *writer = &graph_execution_traces_writer_;
548 break;
549 }
550 }
551
GetSuffix(DebugEventFileType type)552 const string DebugEventsWriter::GetSuffix(DebugEventFileType type) {
553 switch (type) {
554 case METADATA:
555 return kMetadataSuffix;
556 case SOURCE_FILES:
557 return kSourceFilesSuffix;
558 case STACK_FRAMES:
559 return kStackFramesSuffix;
560 case GRAPHS:
561 return kGraphsSuffix;
562 case EXECUTION:
563 return kExecutionSuffix;
564 case GRAPH_EXECUTION_TRACES:
565 return kGraphExecutionTracesSuffix;
566 default:
567 string suffix;
568 return suffix;
569 }
570 }
571
GetFileNameInternal(DebugEventFileType type)572 string DebugEventsWriter::GetFileNameInternal(DebugEventFileType type) {
573 const string suffix = GetSuffix(type);
574 return strings::StrCat(file_prefix_, ".", suffix);
575 }
576
577 } // namespace tfdbg
578 } // namespace tensorflow
579