/* Copyright 2016 The TensorFlow Authors All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_

#include <map>
#include <set>
#include <string>
#include <vector>

#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor_description.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/regexp.h"
#include "tensorflow/core/profiler/tfprof_log.pb.h"
#include "tensorflow/core/profiler/tfprof_options.h"

namespace tensorflow {
namespace tfprof {
std::vector<int64> ShapeProtoToVec(const TensorShapeProto& shape_pb);

TensorShapeProto VecToShapeProto(const std::vector<int64>& shape_vec);

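// Usage sketch (illustrative, not part of this header's API surface):
//
//   TensorShapeProto pb = VecToShapeProto({2, -1, 128});
//   std::vector<int64> dims = ShapeProtoToVec(pb);  // {2, -1, 128}
//
// Unknown dimensions are conventionally represented as -1.
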
class TFGraphNode;

class CallStack {
 public:
  class Trace {
   public:
    Trace(const CodeDef::Trace* trace,
          const std::map<int64, string>* id_to_string)
        : trace_(trace), id_to_string_(id_to_string) {}

    const int32 lineno() const { return trace_->lineno(); }
    string file() const {
      // Backward compatible with old proto files.
      if (!trace_->file().empty()) return trace_->file();
      return id_to_string_->at(trace_->file_id());
    }
    string function() const {
      // Backward compatible with old proto files.
      if (!trace_->function().empty()) return trace_->function();
      return id_to_string_->at(trace_->function_id());
    }
    int32 func_start_line() const { return trace_->func_start_line(); }

   private:
    const CodeDef::Trace* trace_;
    const std::map<int64, string>* id_to_string_;
  };

  CallStack(const CodeDef& def, const std::map<int64, string>* id_to_string)
      : def_(def) {
    traces_.reserve(def.traces_size());
    for (const auto& t : def_.traces()) {
      traces_.emplace_back(&t, id_to_string);
    }
  }

  const CodeDef& code_def() const { return def_; }
  const std::vector<Trace>& traces() const { return traces_; }

 private:
  std::vector<Trace> traces_;
  CodeDef def_;
};
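
// Usage sketch (illustrative; assumes `code_def` and `id_to_string` were
// deserialized from a profile):
//
//   CallStack stack(code_def, &id_to_string);
//   for (const CallStack::Trace& t : stack.traces()) {
//     fprintf(stdout, "%s:%d (%s)\n", t.file().c_str(), t.lineno(),
//             t.function().c_str());
//   }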

class ExecStep {
 public:
  ExecStep() {}

  void AddTimeStats(const string& dev, const NodeExecStats& step_stat);

  void AddMemoryStats(const string& dev, const NodeExecStats& step_stat);

  int64 run_count() const { return exec_.run_count(); }
  // The execution time of an op. If it runs on an accelerator, this is
  // accelerator_exec_micros(). Otherwise, it's CPU time.
  int64 exec_micros() const;
  // The accelerator execution time of an op. 0 if not run on an accelerator.
  int64 accelerator_exec_micros() const;
  // The cpu execution time of an op.
  int64 cpu_exec_micros() const;

  const std::map<string, std::vector<std::pair<int64, int64>>>& op_execs()
      const {
    return op_execs_;
  }
  const std::map<string, std::vector<std::pair<int64, int64>>>& cpu_execs()
      const {
    return cpu_execs_;
  }
  int64 all_start_micros() const { return exec_.all_start_micros(); }
  int64 latest_end_micros() const { return exec_.latest_end_micros(); }
  int64 lastest_schedule_end_micros() const {
    int64 ret = 0;
    for (const auto& exec : cpu_execs_) {
      for (const auto& pair : exec.second) {
        ret = std::max(ret, pair.first + pair.second);
      }
    }
    return ret;
  }
  int64 requested_bytes() const {
    int64 requested_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      requested_bytes += exec.requested_bytes();
    }
    return requested_bytes;
  }
  int64 peak_bytes() const {
    int64 peak_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      peak_bytes += exec.peak_bytes();
    }
    return peak_bytes;
  }
  int64 residual_bytes() const {
    int64 residual_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      residual_bytes += exec.residual_bytes();
    }
    return residual_bytes;
  }
  int64 output_bytes() const {
    int64 output_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      output_bytes += exec.output_bytes();
    }
    return output_bytes;
  }
  int64 accelerator_temp_bytes() const {
    int64 accelerator_temp_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      accelerator_temp_bytes += exec.accelerator_temp_bytes();
    }
    return accelerator_temp_bytes;
  }
  int64 host_temp_bytes() const {
    int64 host_temp_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      host_temp_bytes += exec.host_temp_bytes();
    }
    return host_temp_bytes;
  }
  int64 accelerator_persistent_bytes() const {
    int64 accelerator_persistent_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      accelerator_persistent_bytes += exec.accelerator_persistent_bytes();
    }
    return accelerator_persistent_bytes;
  }
  int64 host_persistent_bytes() const {
    int64 host_persistent_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      host_persistent_bytes += exec.host_persistent_bytes();
    }
    return host_persistent_bytes;
  }
  std::map<int64, int64> allocator_bytes_in_use() const {
    std::map<int64, int64> bytes_in_use;
    for (const ExecMemory& exec : memory_execs_) {
      bytes_in_use[exec.memory_micros()] = exec.allocator_bytes_in_use();
    }
    return bytes_in_use;
  }

  const std::vector<AllocationRecord>& allocations() const {
    return allocations_;
  }

  const ExecProfile& ToProto() {
    exec_.mutable_accelerator_execs()->clear();
    for (const auto& e : accelerator_execs_) {
      auto& exec_time = (*exec_.mutable_accelerator_execs())[e.first];
      for (const auto& p : e.second) {
        auto* t = exec_time.mutable_times()->Add();
        t->add_int64_values(p.first);
        t->add_int64_values(p.second);
      }
    }

    exec_.mutable_cpu_execs()->clear();
    for (const auto& e : cpu_execs_) {
      auto& exec_time = (*exec_.mutable_cpu_execs())[e.first];
      for (const auto& p : e.second) {
        auto* t = exec_time.mutable_times()->Add();
        t->add_int64_values(p.first);
        t->add_int64_values(p.second);
      }
    }

    exec_.mutable_devices()->Clear();
    exec_.mutable_devices()->Reserve(devices_.size());
    for (const string& d : devices_) {
      exec_.add_devices(d);
    }
    exec_.mutable_allocations()->Clear();
    for (const auto& r : allocations_) {
      exec_.add_allocations()->MergeFrom(r);
    }

    exec_.mutable_memory_execs()->Clear();
    for (const auto& m : memory_execs_) {
      exec_.add_memory_execs()->MergeFrom(m);
    }
    return exec_;
  }

  void FromProto(const ExecProfile& exec) {
    exec_.Clear();
    exec_.MergeFrom(exec);

    devices_.clear();
    devices_.insert(exec.devices().begin(), exec.devices().end());

    accelerator_execs_.clear();
    cpu_execs_.clear();
    op_execs_.clear();

    allocations_.clear();
    memory_execs_.clear();

    for (const auto& exec_time : exec_.accelerator_execs()) {
      auto& exec = accelerator_execs_[exec_time.first];
      auto& op_exec = op_execs_[exec_time.first];
      for (const auto& p : exec_time.second.times()) {
        exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
        op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
      }
    }
    for (const auto& exec_time : exec_.cpu_execs()) {
      auto& exec = cpu_execs_[exec_time.first];
      auto& op_exec = op_execs_[exec_time.first];
      for (const auto& p : exec_time.second.times()) {
        exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
        op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
      }
    }
    for (const auto& r : exec_.allocations()) {
      allocations_.push_back(r);
    }
    for (const auto& m : exec_.memory_execs()) {
      memory_execs_.push_back(m);
    }
  }

 private:
  ExecProfile exec_;
  // device -> vector of {op_start_micros, op_exec_micros} pairs.
  // accelerator_execs: gpu:id/stream:all -> {op_start_micros, op_exec_micros}.
  // For accelerators, the vector size can be larger than 1 when a kernel
  // fires multiple times or the op runs inside tf.while_loop.
  std::map<string, std::vector<std::pair<int64, int64>>> accelerator_execs_;
  // cpu_execs: cpu/gpu:id -> {op_start_micros, op_exec_micros}.
  // For cpu, the vector size can be larger than 1 if inside tf.while_loop.
  std::map<string, std::vector<std::pair<int64, int64>>> cpu_execs_;
  // Combines accelerator_execs_ and cpu_execs_.
  std::map<string, std::vector<std::pair<int64, int64>>> op_execs_;
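  // Illustrative entries (hypothetical values, in microseconds):
  //   accelerator_execs_["gpu:0/stream:all"] = {{1000, 50}, {1200, 30}};
  //   cpu_execs_["cpu:0"] = {{990, 70}};
  // op_execs_ would then contain both keys.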
  // Each ExecMemory corresponds to one scheduling of the op. Normally,
  // there are multiple schedulings inside a while_loop.
  std::vector<ExecMemory> memory_execs_;
  // All devices the op is associated with (e.g. gpu:0 (scheduling),
  // gpu:0:stream:xx (kernel exec), cpu:0 (host)).
  std::set<string> devices_;

  // The history of accelerator allocations and deallocations of this step.
  std::vector<AllocationRecord> allocations_;
};
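
// Usage sketch (illustrative; assumes `stats` is a NodeExecStats taken from
// RunMetadata's StepStats for one device):
//
//   ExecStep step;
//   step.AddTimeStats("gpu:0", stats);
//   step.AddMemoryStats("gpu:0", stats);
//   int64 total_us = step.exec_micros();  // cpu + accelerator time
//   int64 peak = step.peak_bytes();       // summed over schedulings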

#define GRAPH_NODE_BYTES(type)             \
  do {                                     \
    if (execs_.empty()) {                  \
      return 0;                            \
    }                                      \
    if (step >= 0) {                       \
      auto exec = execs_.find(step);       \
      if (exec == execs_.end()) return 0;  \
      return exec->second.type##_bytes();  \
    }                                      \
                                           \
    int64 bytes = 0;                       \
    for (const auto& exec : execs_) {      \
      bytes += exec.second.type##_bytes(); \
    }                                      \
    return bytes / execs_.size();          \
  } while (0)
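
// The macro assumes a `step` parameter and an `execs_` member in the
// enclosing scope. For example, GRAPH_NODE_BYTES(peak) expanded inside
// TFGraphNode::peak_bytes(step) below returns that step's
// ExecStep::peak_bytes() when step >= 0, and the average across all
// recorded steps otherwise.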

class TFGraphNode {
 public:
  TFGraphNode(const ProfileNode& node, const ProfileProto& profile,
              const std::map<int64, string>* id_to_string,
              const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) {
    nodes_map_ = nodes_map;
    FromProto(node, profile, id_to_string);
  }

  TFGraphNode(const NodeDef* node, int64 id,
              const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) {
    nodes_map_ = nodes_map;
    node_.set_id(id);
    node_.set_name(node->name());
    node_.set_op(node->op());
    node_.set_float_ops(0);

    for (const auto& attr : node->attr()) {
      (*node_.mutable_attrs())[attr.first].MergeFrom(attr.second);
      if (attr.first == "shape" && attr.second.has_shape()) {
        if (!shape_.empty()) {
          fprintf(stderr, "Found duplicated shapes!\n");
          continue;
        }
        shape_ = ShapeProtoToVec(attr.second.shape());
      } else if (attr.first == "_output_shapes" && attr.second.has_list()) {
        if (!output_shapes_.empty()) {
          fprintf(stderr, "Found duplicated output shapes!\n");
          continue;
        }
        for (int i = 0; i < attr.second.list().shape_size(); ++i) {
          output_shapes_[i] = ShapeProtoToVec(attr.second.list().shape(i));
        }
      }
    }
    op_types_.insert(node->op());
  }

  void AddInput(const string& input, int64 output_index, int input_idx) {
    inputs_[input_idx] = input;
    src_output_idx_[input] = output_index;
  }

  void AddOpType(const string& op_type) { op_types_.insert(op_type); }

  void AddStepStat(int64 step, const string& device,
                   const NodeExecStats& step_stat);

  void AddFloatOps(int64 float_ops) { node_.set_float_ops(float_ops); }

  // TODO(xpan): This could take a lot of memory.
  void AddCode(const CodeDef& code,
               const std::map<int64, string>* id_to_string) {
    if (!call_stack_) {
      call_stack_.reset(new CallStack(code, id_to_string));
    }
  }

  const string& name() const { return node_.name(); }
  int64 id() const { return node_.id(); }
  const string& op() const { return node_.op(); }
  const ProfileNode& node() { return node_; }

  bool trackable(int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) return false;

    if (exec->second.all_start_micros() == 0) return false;
    if (node_.canonical_device().empty() || node_.host_device().empty()) {
      return false;
    }
    return true;
  }

  const ProfileNode& ToProto(
      const std::map<string, std::unique_ptr<TFGraphNode>>& nodes_map) {
    node_.clear_shape();
    node_.mutable_shape()->Reserve(shape().size());
    for (int64 s : shape()) {
      node_.add_shape(s);
    }

    node_.clear_op_types();
    node_.mutable_op_types()->Reserve(op_types().size());
    for (const string& t : op_types()) {
      node_.add_op_types(t);
    }

    node_.clear_execs();
    for (auto& exec : execs_) {
      auto& exec_pb = (*node_.mutable_execs())[exec.first];
      exec_pb.MergeFrom(exec.second.ToProto());
    }

    node_.clear_inputs();
    for (const auto& inp : inputs_) {
      (*node_.mutable_inputs())[inp.first] = nodes_map.at(inp.second)->id();
    }

    node_.clear_input_shapes();
    for (const auto& s : input_shapes_) {
      auto& shape = (*node_.mutable_input_shapes())[s.first];
      for (int64 d : s.second) {
        shape.add_int64_values(d);
      }
    }

    node_.clear_output_shapes();
    for (const auto& s : output_shapes_) {
      auto& shape = (*node_.mutable_output_shapes())[s.first];
      for (int64 d : s.second) {
        shape.add_int64_values(d);
      }
    }

    node_.clear_src_output_index();
    for (const auto& s : src_output_idx_) {
      int64 id = nodes_map.at(s.first)->id();
      (*node_.mutable_src_output_index())[id] = s.second;
    }

    if (call_stack_) {
      node_.clear_trace();
      node_.mutable_trace()->MergeFrom(call_stack_->code_def());
    }
    return node_;
  }

  void FromProto(const ProfileNode& node, const ProfileProto& profile,
                 const std::map<int64, string>* id_to_string) {
    node_.Clear();
    node_.MergeFrom(node);

    call_stack_.reset(new CallStack(node.trace(), id_to_string));

    op_types_.clear();
    op_types_.insert(node_.op_types().begin(), node_.op_types().end());

    shape_.clear();
    for (int64 s : node_.shape()) {
      shape_.push_back(s);
    }

    execs_.clear();
    for (const auto& exec_pb : node.execs()) {
      auto& exec = execs_[exec_pb.first];
      exec.FromProto(exec_pb.second);
    }

    inputs_.clear();
    for (const auto& inp : node.inputs()) {
      inputs_[inp.first] = profile.nodes().at(inp.second).name();
    }

    input_shapes_.clear();
    for (const auto& s : node.input_shapes()) {
      auto& shape = input_shapes_[s.first];
      for (const int64 d : s.second.int64_values()) {
        shape.push_back(d);
      }
    }

    output_shapes_.clear();
    for (const auto& s : node.output_shapes()) {
      auto& shape = output_shapes_[s.first];
      for (const int64 d : s.second.int64_values()) {
        shape.push_back(d);
      }
    }

    src_output_idx_.clear();
    for (const auto& s : node.src_output_index()) {
      src_output_idx_[profile.nodes().at(s.first).name()] = s.second;
    }
  }

  const std::map<int32, string>& inputs() const { return inputs_; }

  // Number of times the graph node is executed. When step < 0, returns the
  // average number of times executed across all steps.
  int64 run_count(int64 step) const {
    if (execs_.empty()) {
      return 0;
    }
    if (step >= 0) {
      auto exec = execs_.find(step);
      if (exec == execs_.end()) {
        return 0;
      }
      return exec->second.run_count();
    }
    int64 total_run_count = 0;
    for (const auto& exec : execs_) {
      total_run_count += exec.second.run_count();
    }
    return total_run_count / execs_.size();
  }
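  // Example (hypothetical): with execs_ = {step 1: 2 runs, step 2: 4 runs},
  // run_count(1) == 2 and run_count(-1) == (2 + 4) / 2 == 3.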
  // This is the overall computation time, including both cpu and accelerator.
  // Note that cpu and accelerator may or may not run in parallel.
  int64 exec_micros(int64 step) const {
    // Empty when no RunMetadata is provided.
    if (execs_.empty()) {
      return 0;
    }
    if (step >= 0) {
      auto exec = execs_.find(step);
      if (exec == execs_.end()) {
        return 0;
      }
      return exec->second.exec_micros();
    }

    int64 total_micros = 0;
    for (const auto& exec : execs_) {
      total_micros += exec.second.exec_micros();
    }
    return total_micros / execs_.size();
  }

  // This is the accelerator computation time of a step, or the average over
  // multiple steps when step < 0.
  int64 accelerator_exec_micros(int64 step) const {
    // Empty when no RunMetadata is provided.
    if (execs_.empty()) {
      return 0;
    }
    if (step >= 0) {
      auto exec = execs_.find(step);
      if (exec == execs_.end()) {
        return 0;
      }
      return exec->second.accelerator_exec_micros();
    }

    int64 total_micros = 0;
    for (const auto& exec : execs_) {
      total_micros += exec.second.accelerator_exec_micros();
    }
    return total_micros / execs_.size();
  }

  // This is the cpu computation time of a step, or the average over
  // multiple steps when step < 0.
  int64 cpu_exec_micros(int64 step) const {
    // Empty when no RunMetadata is provided.
    if (execs_.empty()) {
      return 0;
    }
    if (step >= 0) {
      auto exec = execs_.find(step);
      if (exec == execs_.end()) {
        return 0;
      }
      return exec->second.cpu_exec_micros();
    }

    int64 total_micros = 0;
    for (const auto& exec : execs_) {
      total_micros += exec.second.cpu_exec_micros();
    }
    return total_micros / execs_.size();
  }

  int64 requested_bytes(int64 step) const { GRAPH_NODE_BYTES(requested); }
  int64 peak_bytes(int64 step) const { GRAPH_NODE_BYTES(peak); }
  int64 residual_bytes(int64 step) const { GRAPH_NODE_BYTES(residual); }
  int64 output_bytes(int64 step) const { GRAPH_NODE_BYTES(output); }

  int64 all_start_micros(int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return 0;
    }
    return exec->second.all_start_micros();
  }

  int64 latest_end_micros(int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return 0;
    }
    return exec->second.latest_end_micros();
  }

  int64 lastest_schedule_end_micros(int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return 0;
    }
    return exec->second.lastest_schedule_end_micros();
  }

  const std::map<string, std::vector<std::pair<int64, int64>>>& op_execs(
      int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return empty_execs_;
    }
    return exec->second.op_execs();
  }
  const std::map<string, std::vector<std::pair<int64, int64>>>& cpu_execs(
      int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return empty_execs_;
    }
    return exec->second.cpu_execs();
  }

  const std::map<int64, ExecStep>& all_op_execs() const { return execs_; }

  int64 accelerator_temp_bytes(int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return 0;
    }
    return exec->second.accelerator_temp_bytes();
  }
  int64 host_temp_bytes(int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return 0;
    }
    return exec->second.host_temp_bytes();
  }
  int64 accelerator_persistent_bytes() const {
    int64 persistent_bytes = 0;
    for (const auto& exec : execs_) {
      persistent_bytes = std::max(persistent_bytes,
                                  exec.second.accelerator_persistent_bytes());
    }
    return persistent_bytes;
  }
  const std::map<int64, int64> allocator_bytes_in_use(int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return empty_bytes_in_use_;
    }
    return exec->second.allocator_bytes_in_use();
  }

  const std::vector<AllocationRecord>& allocations(int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return empty_allocations_;
    }
    return exec->second.allocations();
  }

  int64 parameters() const {
    if (!shape().empty()) {
      int64 params = 1;
      bool complete_shape = true;
      for (int64 d : shape()) {
        // A dim is <0 when unknown, in which case the parameter count
        // is incomplete.
        if (d < 0) {
          complete_shape = false;
          break;
        }
        params *= d;
      }
      if (complete_shape) {
        return params;
      } else {
        fprintf(stderr, "Incomplete shape.\n");
      }
    }
    return 0;
  }
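  // Example (hypothetical): shape() == {64, 128} yields 8192 parameters;
  // shape() == {-1, 128} has an unknown dim and yields 0.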
677 
float_ops(int64 step)678   int64 float_ops(int64 step) const {
679     // If not run, return static analysis.
680     if (execs_.empty()) {
681       return node_.float_ops();
682     }
683     // Otherwise, return dynamic float_ops.
684     return node_.float_ops() * run_count(step);
685   }
call_stack()686   const CallStack* call_stack() { return call_stack_.get(); }
canonical_device()687   string canonical_device() const { return node_.canonical_device(); }
host_device()688   string host_device() const { return node_.host_device(); }
op_types()689   const std::set<string>& op_types() const { return op_types_; }
690 
op_attrs(const string & name)691   const AttrValue* op_attrs(const string& name) const {
692     const auto it = node_.attrs().find(name);
693     if (it == node_.attrs().end()) {
694       return nullptr;
695     }
696     return &it->second;
697   }
698 
shape()699   const std::vector<int64>& shape() const { return shape_; }
700 
output_shapes()701   const std::map<int, std::vector<int64>>& output_shapes() const {
702     return output_shapes_;
703   }
704 
input_shapes()705   const std::map<int, std::vector<int64>> input_shapes() const {
706     std::map<int, std::vector<int64>> input_shapes;
707     for (const auto& inp : inputs_) {
708       // Always create an empty vec even if the shape info might be missing.
709       std::vector<int64>& shape_vec = input_shapes[inp.first];
710       if (!nodes_map_) continue;
711       auto input_it = nodes_map_->find(inp.second);
712       if (input_it == nodes_map_->end()) continue;
713       auto output_it = src_output_idx_.find(inp.second);
714       if (output_it == src_output_idx_.end()) continue;
715 
716       const TFGraphNode* input_node = input_it->second.get();
717       if (!input_node) continue;
718       const auto& output_shapes = input_node->output_shapes();
719       const auto& output_shape = output_shapes.find(output_it->second);
720       if (output_shape == output_shapes.end()) continue;
721 
722       if (output_shape != input_node->output_shapes().end()) {
723         shape_vec.assign(output_shape->second.begin(),
724                          output_shape->second.end());
725       }
726     }
727     return input_shapes;
728   }

 private:
  // Maps graph node name to TFGraphNode. Not owned.
  const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map_;
  // Inputs to the node: input index -> input node name.
  std::map<int, string> inputs_;
  // The output index of the source node.
  std::map<string, int32> src_output_idx_;
  // Proto for the serialized/deserialized representation of the node.
  ProfileNode node_;
  // Python call stack that created the node.
  std::unique_ptr<CallStack> call_stack_;
  // Shape of the node (e.g. Variable) if available.
  std::vector<int64> shape_;
  // No input_idx is missing, but some shapes might be empty (unknown).
  std::map<int, std::vector<int64>> input_shapes_;
  // Could miss an output_idx if there is no _output_shapes attr. Some
  // shapes can also be empty.
  std::map<int, std::vector<int64>> output_shapes_;

  std::set<string> op_types_;

  std::map<int64, ExecStep> execs_;

  // Placeholders for empty cases.
  std::map<int64, int64> empty_bytes_in_use_;
  std::map<string, std::vector<std::pair<int64, int64>>> empty_execs_;
  std::vector<AllocationRecord> empty_allocations_;
};
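
// Usage sketch (illustrative; `node_def` is a NodeDef from a GraphDef and
// `nodes_map` owns all TFGraphNodes of the graph):
//
//   TFGraphNode n(&node_def, /*id=*/0, &nodes_map);
//   n.AddInput("input_op", /*output_index=*/0, /*input_idx=*/0);
//   int64 us = n.exec_micros(/*step=*/-1);        // average across steps
//   int64 bytes = n.requested_bytes(/*step=*/-1);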

class TFMultiGraphNode {
 public:
  TFMultiGraphNode(const string& name)
      : name_(name),
        step_(-1),
        run_count_(0),
        exec_micros_(0),
        accelerator_exec_micros_(0),
        cpu_exec_micros_(0),
        requested_bytes_(0),
        peak_bytes_(0),
        residual_bytes_(0),
        output_bytes_(0),
        float_ops_(0),
        parameters_(0) {}

  bool SnapshotNodes(int64 step, const std::vector<string>& type_regexes) {
    run_count_ = 0;
    exec_micros_ = 0;
    accelerator_exec_micros_ = 0;
    cpu_exec_micros_ = 0;

    requested_bytes_ = 0;
    peak_bytes_ = 0;
    residual_bytes_ = 0;
    output_bytes_ = 0;

    float_ops_ = 0;
    parameters_ = 0;
    op_types_.clear();
    shapes_.clear();
    devices_.clear();
    snapshot_nodes_.clear();

    step_ = step;
    std::vector<const TFGraphNode*> nodes = pick_nodes(type_regexes);

    if (nodes.empty()) {
      return (type_regexes.size() == 1 && type_regexes[0] == ".*");
    }

    for (const TFGraphNode* node : nodes) {
      op_types_.insert(node->op_types().begin(), node->op_types().end());

      run_count_ += node->run_count(step);
      exec_micros_ += node->exec_micros(step);
      accelerator_exec_micros_ += node->accelerator_exec_micros(step);
      cpu_exec_micros_ += node->cpu_exec_micros(step);

      requested_bytes_ += node->requested_bytes(step);
      peak_bytes_ += node->peak_bytes(step);
      residual_bytes_ += node->residual_bytes(step);
      output_bytes_ += node->output_bytes(step);

      float_ops_ += node->float_ops(step);
      parameters_ += node->parameters();
      if (node->shape().size() > 0) {
        shapes_.push_back(node->shape());
      }
      devices_.insert(node->canonical_device());
      snapshot_nodes_[node->name()] = node;
    }
    return true;
  }

  int64 step() const { return step_; }

  void AddGraphNode(const TFGraphNode* node) {
    if (nodes_.find(node->name()) != nodes_.end()) {
      return;
    }
    nodes_[node->name()] = node;
  }

  const std::map<string, const TFGraphNode*>& graph_nodes() const {
    return snapshot_nodes_;
  }

  const string& name() const { return name_; }

  int64 run_count() const { return run_count_; }
  int64 exec_micros() const { return exec_micros_; }
  int64 accelerator_exec_micros() const { return accelerator_exec_micros_; }
  int64 cpu_exec_micros() const { return cpu_exec_micros_; }

  int64 requested_bytes() const { return requested_bytes_; }
  int64 peak_bytes() const { return peak_bytes_; }
  int64 residual_bytes() const { return residual_bytes_; }
  int64 output_bytes() const { return output_bytes_; }

  int64 float_ops() const { return float_ops_; }

  int64 parameters() const { return parameters_; }

  const std::set<string>& devices() const { return devices_; }

  const std::set<string>& op_types() const { return op_types_; }

  const std::vector<std::vector<int64>>& shapes() const { return shapes_; }

 private:
  std::vector<const TFGraphNode*> pick_nodes(
      const std::vector<string>& type_regexes) {
    if (type_regexes.empty()) {
      return {};
    }
    std::vector<const TFGraphNode*> ret;
    if (type_regexes.size() == 1 && type_regexes[0] == ".*") {
      for (const auto& n : nodes_) {
        ret.push_back(n.second);
      }
      return ret;
    }

    for (const string& regex : type_regexes) {
      for (const auto& n : nodes_) {
        for (const string& type : n.second->op_types()) {
          if (RE2::FullMatch(type, regex)) {
            ret.push_back(n.second);
            break;
          }
        }
      }
    }
    return ret;
  }

  const string name_;
  int64 step_;
  // Snapshot state, based on type_regexes.
  std::set<string> op_types_;
  int64 run_count_;
  int64 exec_micros_;
  int64 accelerator_exec_micros_;
  int64 cpu_exec_micros_;

  int64 requested_bytes_;
  int64 peak_bytes_;
  int64 residual_bytes_;
  int64 output_bytes_;
  int64 float_ops_;
  int64 parameters_;
  std::set<string> devices_;
  std::vector<std::vector<int64>> shapes_;
  std::map<string, const TFGraphNode*> snapshot_nodes_;

  // Overall data held by the TFMultiGraphNode.
  std::map<string, const TFGraphNode*> nodes_;
};
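
// Usage sketch (illustrative; aggregates stats over all nodes whose op type
// matches any of the given regexes):
//
//   TFMultiGraphNode multi("conv_ops");
//   multi.AddGraphNode(&node_a);
//   multi.AddGraphNode(&node_b);
//   if (multi.SnapshotNodes(/*step=*/-1, {"Conv2D.*"})) {
//     int64 us = multi.exec_micros();
//     int64 params = multi.parameters();
//   }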

bool IsPlacedOnCPU(const string& device);
bool IsPlacedOnAccelerator(const string& device);
bool CountAsAcceleratorTime(const string& device);
bool CountAsCPUTime(const string& device);
bool IsCanonicalDevice(const string& device);
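// These predicates classify device strings such as "cpu:0", "gpu:0", or
// "gpu:0/stream:all" (see the comments on ExecStep above); the exact
// matching rules are defined in the corresponding .cc file.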

}  // namespace tfprof
}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_