1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/debug/debug_graph_utils.h"
17
18 #include "tensorflow/core/common_runtime/memory_types.h"
19 #include "tensorflow/core/framework/kernel_def.pb.h"
20 #include "tensorflow/core/framework/node_def_builder.h"
21 #include "tensorflow/core/framework/op_kernel.h"
22 #include "tensorflow/core/graph/node_builder.h"
23 #include "tensorflow/core/lib/strings/strcat.h"
24 #include "tensorflow/core/protobuf/debug.pb.h"
25
26 namespace tensorflow {
27
28 namespace {
29
30 // TODO(cais): Switch to safe_strtob when available.
ParseBoolString(const string & bool_str,bool * bool_val)31 Status ParseBoolString(const string& bool_str, bool* bool_val) {
32 const string lower_bool_str = absl::AsciiStrToLower(bool_str);
33 if (lower_bool_str == "false" || lower_bool_str == "f" ||
34 lower_bool_str == "0") {
35 *bool_val = false;
36 } else if (lower_bool_str == "true" || lower_bool_str == "t" ||
37 lower_bool_str == "1") {
38 *bool_val = true;
39 } else {
40 return errors::InvalidArgument("Invalid string for bool value: ", bool_str);
41 }
42 return OkStatus();
43 }
44
45 } // namespace
46
47 // static
InsertNodes(const protobuf::RepeatedPtrField<DebugTensorWatch> & watches,Graph * graph,Device * device)48 Status DebugNodeInserter::InsertNodes(
49 const protobuf::RepeatedPtrField<DebugTensorWatch>& watches, Graph* graph,
50 Device* device) {
51 // TODO(cais): This method is getting too large in size.
52 // Refactor it with helpers.
53
54 if (watches.empty()) {
55 // Nothing to do: Return OK right away.
56 return OkStatus();
57 }
58
59 // Debug ops and URLs for wildcard node names (if any).
60 std::vector<string> default_debug_ops;
61 std::vector<string> default_debug_urls;
62
63 // A map from tensor name (e.g., "node_a:0") to list of debug op names
64 // (e.g., {"DebugIdentity", "DebugNanCount"})
65 std::unordered_map<string, std::vector<string>> tensor_watches;
66 // A map from tensor name to debug_url.
67 std::unordered_map<string, std::vector<string>> tensor_watch_urls;
68 std::unordered_map<string, bool> tensor_tolerate_failures;
69
70 // Cache the proto content for fast lookup later
71 for (const DebugTensorWatch& watch : watches) {
72 if (watch.debug_ops().empty()) {
73 continue;
74 }
75
76 if (watch.debug_urls().empty()) {
77 continue;
78 }
79
80 if (watch.node_name() == "*") {
81 if (watch.output_slot() == -1) {
82 default_debug_ops.insert(default_debug_ops.end(),
83 watch.debug_ops().begin(),
84 watch.debug_ops().end());
85 default_debug_urls.insert(default_debug_urls.end(),
86 watch.debug_urls().begin(),
87 watch.debug_urls().end());
88 } else {
89 return Status(error::FAILED_PRECONDITION,
90 strings::StrCat(
91 "output_slot is expected to be -1 for wildcard ",
92 "node name (\"*\"), but got ", watch.output_slot()));
93 }
94 continue;
95 } else {
96 if (watch.output_slot() < 0) {
97 return Status(
98 error::FAILED_PRECONDITION,
99 strings::StrCat("A negative output_slot in DebugTensorWatch is ",
100 "valid only for the wildcard node name (\"*\"), ",
101 "but got node name ", watch.node_name()));
102 }
103 }
104
105 string tensor_name =
106 strings::StrCat(watch.node_name(), ":", watch.output_slot());
107
108 std::vector<string> debug_ops;
109 for (const string& debug_op : watch.debug_ops()) {
110 debug_ops.push_back(debug_op);
111 }
112
113 tensor_watches[tensor_name] = debug_ops;
114 tensor_tolerate_failures[tensor_name] =
115 watch.tolerate_debug_op_creation_failures();
116
117 std::vector<string> urls;
118 for (const string& url : watch.debug_urls()) {
119 urls.push_back(url);
120 }
121 tensor_watch_urls[tensor_name] = urls;
122 }
123
124 if (tensor_watches.empty()) {
125 return OkStatus();
126 }
127
128 DeviceType device_type = DeviceType{device->device_type()};
129
130 // Keep track of all edges to be removed.
131 std::vector<const Edge*> edges_to_remove;
132
133 for (Node* src_node : graph->nodes()) {
134 // Make a map from output slot to outgoing edges from the slot.
135 std::unordered_map<int, std::vector<const Edge*>> output_slot_to_edges;
136 for (const Edge* edge : src_node->out_edges()) {
137 const int src_output = edge->src_output();
138 if (output_slot_to_edges.find(src_output) == output_slot_to_edges.end()) {
139 output_slot_to_edges[src_output] = {edge};
140 } else {
141 output_slot_to_edges[src_output].push_back(edge);
142 }
143 }
144
145 // Iterate through all output slots of the node.
146 for (int src_output_slot = 0; src_output_slot < src_node->num_outputs();
147 ++src_output_slot) {
148 const string tensor_name =
149 strings::StrCat(src_node->name(), ":", src_output_slot);
150 const bool explicit_tensor_match =
151 tensor_watches.find(tensor_name) != tensor_watches.end();
152 if (!explicit_tensor_match && default_debug_ops.empty()) {
153 continue;
154 }
155
156 // Now we have encountered a watched tensor. We will:
157 // 1) Mark this edge as to be removed, iff this is a non-Reference
158 // tensor
159 // 2) Create a Copy node for the tensor
160 // 3) Add a new edge, from the source tensor to the Copy node
161 // 4) Add a new edge, from the Copy node to the destination node, iff
162 // this is a non-Reference tensor.
163 // 5) Create all the requested debug nodes and their edges to the Copy
164 // node.
165 // 6) Add control edges from the debug nodes to the destination nodes
166 // to ensure that the tensors values exported by the debug nodes
167 // to the debug URLs reflect the values before the execution of
168 // the destination nodes.
169
170 const DataType src_dt = src_node->output_type(src_output_slot);
171 MemoryType memory_type;
172 TF_RETURN_IF_ERROR(MemoryTypeForOutput(device_type, graph, src_node,
173 src_output_slot, &memory_type));
174
175 // Create the copy node for the watched tensor.
176 const std::vector<string> debug_ops = explicit_tensor_match
177 ? tensor_watches[tensor_name]
178 : default_debug_ops;
179 const std::vector<string> debug_urls =
180 explicit_tensor_match ? tensor_watch_urls[tensor_name]
181 : default_debug_urls;
182 Node* copy_node;
183 Status copy_s =
184 CreateCopyNode(graph, device_type, memory_type == HOST_MEMORY,
185 src_node->name(), src_output_slot, src_dt, tensor_name,
186 debug_ops, debug_urls, ©_node);
187 if (!copy_s.ok()) {
188 return Status(
189 error::FAILED_PRECONDITION,
190 strings::StrCat("Failed to create Copy/CopyHost node for tensor ",
191 tensor_name, ", due to: ", copy_s.error_message()));
192 }
193
194 // Add edge from watched tensor to the copy node.
195 graph->AddEdge(src_node, src_output_slot, copy_node, 0);
196
197 // Create all requested debug nodes and their edges to the Copy node.
198 std::vector<Node*> debug_nodes;
199 for (size_t i = 0; i < debug_ops.size(); ++i) {
200 const string& debug_op_name = debug_ops[i];
201
202 Node* debug_node;
203 Status debug_s = CreateDebugNode(graph, *device, copy_node->name(),
204 src_dt, tensor_name, debug_urls, i,
205 debug_op_name, &debug_node);
206 if (debug_s.ok()) {
207 graph->AddEdge(copy_node, 0, debug_node, 0);
208 debug_nodes.push_back(debug_node);
209 } else {
210 if (tensor_tolerate_failures[tensor_name]) {
211 LOG(INFO) << "Tolerating failure to create debug node: "
212 << "tensor name = " << tensor_name << "; "
213 << "debug op name = " << debug_op_name;
214 } else {
215 return Status(
216 error::FAILED_PRECONDITION,
217 strings::StrCat("Failed to create debug node ", debug_op_name,
218 " for tensor ", tensor_name,
219 ", due to: ", debug_s.error_message()));
220 }
221 }
222 }
223
224 // Is the output a reference?
225 const bool is_ref = IsRefType(src_node->output_type(src_output_slot));
226
227 // Iterate through all outgoing edges attached to the slot.
228 for (const Edge* edge : output_slot_to_edges[src_output_slot]) {
229 // Mark the edge for removal.
230 if (!is_ref) {
231 edges_to_remove.push_back(edge);
232 graph->AddEdge(copy_node, 0, edge->dst(), edge->dst_input());
233 }
234
235 // Add control edges from the debug nodes to the destination node
236 // to ensure that the debug nodes are executed before the destination
237 // node. Skip Enter and NextIteration ops to avoid hanging.
238 for (Node* debug_node : debug_nodes) {
239 if (!src_node->IsEnter() && !src_node->IsNextIteration()) {
240 graph->AddEdge(debug_node, Graph::kControlSlot, edge->dst(),
241 Graph::kControlSlot);
242 }
243 }
244 }
245 }
246 }
247
248 // Remove all edges marked for removal.
249 for (const Edge* edge : edges_to_remove) {
250 graph->RemoveEdge(edge);
251 }
252
253 return OkStatus();
254 }
255
DeparallelizeWhileLoops(Graph * graph,Device * device)256 void DebugNodeInserter::DeparallelizeWhileLoops(Graph* graph, Device* device) {
257 bool deparallelized_a_loop = false;
258 for (Node* node : graph->nodes()) {
259 if (node->IsEnter()) {
260 const AttrValue* parallel_iterations =
261 node->attrs().Find("parallel_iterations");
262 if (parallel_iterations && parallel_iterations->i() > 1) {
263 deparallelized_a_loop = true;
264 VLOG(1) << "Changing the parallel_iterations attribute of the "
265 << "Enter/RefEnter node \"" << node->name() << "\" on device \""
266 << device->name() << "\" from " << parallel_iterations->i()
267 << " to 1.";
268 node->AddAttr<int64_t>("parallel_iterations", 1);
269 }
270 }
271 }
272 if (deparallelized_a_loop) {
273 LOG(INFO) << "For debugging, tfdbg has set the parallel_iterations "
274 << "attribute of all scheduled Enter/RefEnter nodes to 1. (This "
275 << "does not affect subsequent non-debug runs.)";
276 }
277 }
278
279 // static
GetCopyNodeName(const string & node_name,const int output_slot)280 const string DebugNodeInserter::GetCopyNodeName(const string& node_name,
281 const int output_slot) {
282 // For example, if the watched node is named "node1" and the output slot
283 // is 0, the debug node will be called: __copy_node1_0
284 return strings::StrCat("__copy_", node_name, "_", output_slot);
285 }
286
287 // static
GetDebugNodeName(const string & tensor_name,const int debug_op_num,const string & debug_op_name)288 const string DebugNodeInserter::GetDebugNodeName(const string& tensor_name,
289 const int debug_op_num,
290 const string& debug_op_name) {
291 // For example, if the watched node is named "node1" and the debug op that
292 // watches the output slot of node1 is of the type "DebugNanCount", the
293 // debug node will be called: __dbg_node1_0_0_DebugNanCount.
294 return strings::StrCat("__dbg_", tensor_name, "_", debug_op_num, "_",
295 debug_op_name);
296 }
297
// static
// Creates a Copy (device-memory) or CopyHost (host-memory) node for the
// watched tensor `src_node_name`:`src_output`, carrying a "debug_ops_spec"
// attribute that is the cross product of `debug_ops` and `debug_urls`.
// On success *copy_node points to the new node. Returns FAILED_PRECONDITION
// if the node definition cannot be built, no kernel exists for the op on
// `device_type`, or the node cannot be added to `graph`.
Status DebugNodeInserter::CreateCopyNode(
    Graph* graph, const DeviceType device_type, const bool is_host_memory,
    const string& src_node_name, const int src_output, const DataType src_dt,
    const string& tensor_name, const std::vector<string>& debug_ops,
    const std::vector<string>& debug_urls, Node** copy_node) {
  // Custom attribute key that marks a debug op as gated by gRPC.
  const string kGatedGrpcAttributeKey = "gated_grpc";

  NodeDef node_def;
  const KernelDef* kdef;

  const string copy_op_name = is_host_memory ? "CopyHost" : "Copy";
  const string copy_node_name = GetCopyNodeName(src_node_name, src_output);

  // Cross debug_ops and debug_urls to get the list of debug ops and watches.
  // Each spec entry has the form "<op_name>;<debug_url>;<gated_grpc: 0|1>".
  std::vector<string> debug_ops_spec;
  for (const string& debug_op : debug_ops) {
    for (const string& debug_url : debug_urls) {
      string debug_op_name_proper;
      std::unordered_map<string, string> custom_attributes;
      TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op, &debug_op_name_proper,
                                          &custom_attributes));

      // gated_grpc defaults to false unless the op spec provides it.
      bool gated_grpc_value = false;
      if (custom_attributes.find(kGatedGrpcAttributeKey) !=
          custom_attributes.end()) {
        TF_RETURN_IF_ERROR(ParseBoolString(
            custom_attributes[kGatedGrpcAttributeKey], &gated_grpc_value));
      }
      debug_ops_spec.push_back(strings::StrCat(debug_op_name_proper, ";",
                                               debug_url, ";",
                                               gated_grpc_value ? "1" : "0"));
    }
  }

  auto builder = NodeDefBuilder(copy_node_name, copy_op_name)
                     .Input(src_node_name, src_output, src_dt)
                     .Attr("debug_ops_spec", debug_ops_spec);

  if (!builder.Finalize(&node_def).ok()) {
    return Status(
        error::FAILED_PRECONDITION,
        strings::StrCat("Failed to create node definition ", "for copy op ",
                        copy_node_name, " on watched tensor ", tensor_name));
  }
  // Verify a kernel for Copy/CopyHost exists on this device type before
  // adding the node to the graph.
  Status s = FindKernelDef(device_type, node_def, &kdef, nullptr);

  if (!s.ok()) {
    return Status(
        error::FAILED_PRECONDITION,
        strings::StrCat("Failed to find kernel definition ", "for copy op ",
                        copy_node_name, " on watched tensor ", tensor_name));
  }
  if (!NodeBuilder(builder).Finalize(graph, copy_node).ok()) {
    return Status(error::FAILED_PRECONDITION,
                  strings::StrCat("Failed to create copy node ", copy_node_name,
                                  " on watched tensor ", tensor_name));
  }

  return OkStatus();
}
359
// static
// Parses a debug op specification of the form
//   "OpName"  or  "OpName(key1=value1;key2=value2;...)"
// into the proper op name (*debug_op_name_proper) and a map of custom
// attribute key/value pairs (*attributes). Returns InvalidArgument for
// malformed specs: unbalanced/misplaced parentheses, a segment without '=',
// empty key or value, or a duplicate key.
Status DebugNodeInserter::ParseDebugOpName(
    const string& debug_op_name, string* debug_op_name_proper,
    std::unordered_map<string, string>* attributes) {
  const size_t l_index = debug_op_name.find('(');
  const size_t r_index = debug_op_name.find(')');
  if (l_index == string::npos && r_index == string::npos) {
    // No parentheses: the whole string is the op name; no attributes.
    *debug_op_name_proper = debug_op_name;
  } else {
    // '(' must exist, must not be the first character, and ')' must be the
    // last character of the spec.
    if (l_index == string::npos || l_index == 0 ||
        r_index != debug_op_name.size() - 1) {
      return errors::InvalidArgument("Malformed debug op name \"",
                                     debug_op_name, "\"");
    }

    *debug_op_name_proper = debug_op_name.substr(0, l_index);
    // Text between the parentheses: ';'-separated "key=value" segments.
    string arguments = debug_op_name.substr(l_index + 1, r_index - l_index - 1);

    std::vector<string> attribute_segs = str_util::Split(arguments, ";");
    for (const string& attribute_seg : attribute_segs) {
      // Trim surrounding whitespace; empty segments (e.g. from "a=1;;b=2")
      // are silently skipped.
      StringPiece seg(attribute_seg);
      str_util::RemoveWhitespaceContext(&seg);
      if (seg.empty()) {
        continue;
      }

      const size_t eq_index = seg.find('=');
      if (eq_index == string::npos) {
        return errors::InvalidArgument(
            "Malformed attributes in debug op name \"", debug_op_name, "\"");
      }

      const string key(seg.substr(0, eq_index));
      // Count may exceed the trimmed segment's remainder; substr clamps it.
      const string value(
          seg.substr(eq_index + 1, attribute_seg.size() - eq_index - 1));
      if (key.empty() || value.empty()) {
        return errors::InvalidArgument(
            "Malformed attributes in debug op name \"", debug_op_name, "\"");
      }

      if (attributes->find(key) == attributes->end()) {
        (*attributes)[key] = value;
      } else {
        return errors::InvalidArgument("Duplicate attribute name \"", key,
                                       "\" found in the debug op: \"",
                                       debug_op_name, "\"");
      }
    }
  }
  return OkStatus();
}
411
412 // static
SetDebugNodeAttributes(Node * debug_node,const std::unordered_map<string,string> & attributes)413 Status DebugNodeInserter::SetDebugNodeAttributes(
414 Node* debug_node, const std::unordered_map<string, string>& attributes) {
415 std::unordered_set<string> unfulfilled_keys;
416 for (const auto& item : attributes) {
417 unfulfilled_keys.insert(item.first);
418 }
419
420 for (const auto& attr : debug_node->op_def().attr()) {
421 if (attributes.find(attr.name()) != attributes.end()) {
422 const string& attr_value = attributes.at(attr.name());
423 if (attr.type() == "string") {
424 debug_node->AddAttr<string>(attr.name(), attr_value);
425 } else if (attr.type() == "float") {
426 float float_value = 0.0;
427 if (!::tensorflow::strings::safe_strtof(attr_value.c_str(),
428 &float_value)) {
429 return errors::InvalidArgument(
430 "Invalid value string for float-type attribute ", attr.name(),
431 "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
432 }
433 debug_node->AddAttr<float>(attr.name(), float_value);
434 } else if (attr.type() == "int") {
435 int64_t int_value = 0;
436 if (!::tensorflow::strings::safe_strto64(attr_value, &int_value)) {
437 return errors::InvalidArgument(
438 "Invalid value string for int-type attribute ", attr.name(),
439 "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
440 }
441 debug_node->AddAttr<int>(attr.name(), int_value);
442 } else if (attr.type() == "bool") {
443 bool bool_value;
444 if (!ParseBoolString(attr_value, &bool_value).ok()) {
445 return errors::InvalidArgument(
446 "Invalid value string for bool-type attribute ", attr.name(),
447 "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
448 }
449 debug_node->AddAttr<bool>(attr.name(), bool_value);
450 } else {
451 return errors::InvalidArgument(
452 "Unsupported type of custom attribute for debug ops: ",
453 attr.type());
454 }
455
456 unfulfilled_keys.erase(attr.name());
457 }
458 }
459
460 if (unfulfilled_keys.empty()) {
461 return OkStatus();
462 } else {
463 return errors::InvalidArgument(
464 unfulfilled_keys.size(),
465 " attribute key(s) were not valid for debug node ", debug_node->name(),
466 ": ", absl::StrJoin(unfulfilled_keys, ", "));
467 }
468 }
469
// static
// Creates one debug node (the `debug_op_num`-th op watching `tensor_name`)
// that consumes output 0 of the Copy node `src_copy_node_name` and publishes
// to `debug_urls`. Custom attributes embedded in `debug_op_name` (e.g.
// "Op(key=value)") are parsed out and applied to the created node.
// On success *debug_node points to the new node; returns FailedPrecondition
// if the node def, kernel lookup, or graph insertion fails.
Status DebugNodeInserter::CreateDebugNode(
    Graph* graph, const Device& device, const string& src_copy_node_name,
    const DataType src_dt, const string& tensor_name,
    const std::vector<string>& debug_urls, const int debug_op_num,
    const string& debug_op_name, Node** debug_node) {
  NodeDef node_def;
  const KernelDef* kdef;

  // Split "Op(attrs...)" into the op name proper and its custom attributes.
  string debug_op_name_proper;
  std::unordered_map<string, string> custom_attributes;
  TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op_name, &debug_op_name_proper,
                                      &custom_attributes));

  const string debug_node_name =
      GetDebugNodeName(tensor_name, debug_op_num, debug_op_name_proper);
  auto builder = NodeDefBuilder(debug_node_name, debug_op_name_proper)
                     .Input(src_copy_node_name, 0, src_dt)
                     .Attr("device_name", device.name())
                     .Attr("tensor_name", tensor_name)
                     .Attr("debug_urls", debug_urls);

  if (!builder.Finalize(&node_def).ok()) {
    return errors::FailedPrecondition(
        "Failed to create node definition for debug op ", debug_op_name_proper,
        " on watched tensor ", tensor_name);
  }
  // Confirm a kernel for the debug op exists on this device type before
  // adding the node to the graph.
  if (!FindKernelDef(DeviceType(device.device_type()), node_def, &kdef, nullptr)
           .ok()) {
    return errors::FailedPrecondition(
        "Failed to find kernel definition for debug op ", debug_op_name_proper,
        " on watched tensor ", tensor_name);
  }
  if (!NodeBuilder(builder).Finalize(graph, debug_node).ok()) {
    return errors::FailedPrecondition("Failed to create debug node ",
                                      debug_op_name_proper,
                                      " on watched tensor ", tensor_name);
  }

  // Set custom attributes (if any).
  if (!custom_attributes.empty()) {
    TF_RETURN_IF_ERROR(SetDebugNodeAttributes(*debug_node, custom_attributes));
  }

  return OkStatus();
}
516
517 } // namespace tensorflow
518