/* Copyright 2016 The TensorFlow Authors All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_

#include <map>
#include <set>
#include <string>
#include <vector>

#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor_description.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/regexp.h"
#include "tensorflow/core/profiler/tfprof_log.pb.h"
#include "tensorflow/core/profiler/tfprof_options.h"

namespace tensorflow {
namespace tfprof {
std::vector<int64> ShapeProtoToVec(const TensorShapeProto& shape_pb);

TensorShapeProto VecToShapeProto(const std::vector<int64>& shape_vec);

class TFGraphNode;

class CallStack {
 public:
  class Trace {
   public:
    Trace(const CodeDef::Trace* trace,
          const std::map<int64, string>* id_to_string)
        : trace_(trace), id_to_string_(id_to_string) {}

    const int32 lineno() const { return trace_->lineno(); }

    string file() const {
      // Backward compatible with old proto files.
      if (!trace_->file().empty()) return trace_->file();
      return id_to_string_->at(trace_->file_id());
    }

    string function() const {
      // Backward compatible with old proto files.
      if (!trace_->function().empty()) return trace_->function();
      return id_to_string_->at(trace_->function_id());
    }

    int32 func_start_line() const { return trace_->func_start_line(); }

   private:
    const CodeDef::Trace* trace_;
    const std::map<int64, string>* id_to_string_;
  };

  CallStack(const CodeDef& def, const std::map<int64, string>* id_to_string)
      : def_(def) {
    traces_.reserve(def.traces_size());
    for (const auto& t : def_.traces()) {
      traces_.emplace_back(&t, id_to_string);
    }
  }

  const CodeDef& code_def() const { return def_; }
  const std::vector<Trace>& traces() const { return traces_; }

 private:
  std::vector<Trace> traces_;
  CodeDef def_;
};
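
// A minimal usage sketch (illustrative; `code_def` and `id_to_string` are
// assumed to come from a loaded profile, not defined in this header):
//
//   CallStack stack(code_def, &id_to_string);
//   for (const CallStack::Trace& t : stack.traces()) {
//     // file()/function() transparently fall back to the string table
//     // (file_id()/function_id()) when reading newer, deduplicated protos.
//     fprintf(stdout, "%s:%d (%s)\n", t.file().c_str(), t.lineno(),
//             t.function().c_str());
//   }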

class ExecStep {
 public:
  ExecStep() {}

  void AddTimeStats(const string& dev, const NodeExecStats& step_stat);

  void AddMemoryStats(const string& dev, const NodeExecStats& step_stat);

  int64 run_count() const { return exec_.run_count(); }
  // The execution time of an op. If it runs on an accelerator, this is
  // accelerator_exec_micros(). Otherwise, it's CPU time.
  int64 exec_micros() const;
  // The accelerator execution time of an op. 0 if not run on an accelerator.
  int64 accelerator_exec_micros() const;
  // The cpu execution time of an op.
  int64 cpu_exec_micros() const;

  const std::map<string, std::vector<std::pair<int64, int64>>>& op_execs()
      const {
    return op_execs_;
  }
  const std::map<string, std::vector<std::pair<int64, int64>>>& cpu_execs()
      const {
    return cpu_execs_;
  }
  int64 all_start_micros() const { return exec_.all_start_micros(); }
  int64 latest_end_micros() const { return exec_.latest_end_micros(); }
  int64 lastest_schedule_end_micros() const {
    int64 ret = 0;
    for (const auto& exec : cpu_execs_) {
      for (const auto& pair : exec.second) {
        ret = std::max(ret, pair.first + pair.second);
      }
    }
    return ret;
  }

  int64 requested_bytes() const {
    int64 requested_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      requested_bytes += exec.requested_bytes();
    }
    return requested_bytes;
  }
  int64 peak_bytes() const {
    int64 peak_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      peak_bytes += exec.peak_bytes();
    }
    return peak_bytes;
  }
  int64 residual_bytes() const {
    int64 residual_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      residual_bytes += exec.residual_bytes();
    }
    return residual_bytes;
  }
  int64 output_bytes() const {
    int64 output_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      output_bytes += exec.output_bytes();
    }
    return output_bytes;
  }
  int64 accelerator_temp_bytes() const {
    int64 accelerator_temp_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      accelerator_temp_bytes += exec.accelerator_temp_bytes();
    }
    return accelerator_temp_bytes;
  }
  int64 host_temp_bytes() const {
    int64 host_temp_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      host_temp_bytes += exec.host_temp_bytes();
    }
    return host_temp_bytes;
  }
  int64 accelerator_persistent_bytes() const {
    int64 accelerator_persistent_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      accelerator_persistent_bytes += exec.accelerator_persistent_bytes();
    }
    return accelerator_persistent_bytes;
  }
  int64 host_persistent_bytes() const {
    int64 host_persistent_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      host_persistent_bytes += exec.host_persistent_bytes();
    }
    return host_persistent_bytes;
  }
  std::map<int64, int64> allocator_bytes_in_use() const {
    std::map<int64, int64> bytes_in_use;
    for (const ExecMemory& exec : memory_execs_) {
      bytes_in_use[exec.memory_micros()] = exec.allocator_bytes_in_use();
    }
    return bytes_in_use;
  }

  const std::vector<AllocationRecord>& allocations() const {
    return allocations_;
  }

  const ExecProfile& ToProto() {
    exec_.mutable_accelerator_execs()->clear();
    for (const auto& e : accelerator_execs_) {
      auto& exec_time = (*exec_.mutable_accelerator_execs())[e.first];
      for (const auto& p : e.second) {
        auto* t = exec_time.mutable_times()->Add();
        t->add_int64_values(p.first);
        t->add_int64_values(p.second);
      }
    }

    exec_.mutable_cpu_execs()->clear();
    for (const auto& e : cpu_execs_) {
      auto& exec_time = (*exec_.mutable_cpu_execs())[e.first];
      for (const auto& p : e.second) {
        auto* t = exec_time.mutable_times()->Add();
        t->add_int64_values(p.first);
        t->add_int64_values(p.second);
      }
    }

    exec_.mutable_devices()->Clear();
    exec_.mutable_devices()->Reserve(devices_.size());
    for (const string& d : devices_) {
      exec_.add_devices(d);
    }

    exec_.mutable_allocations()->Clear();
    for (const auto& r : allocations_) {
      exec_.add_allocations()->MergeFrom(r);
    }

    exec_.mutable_memory_execs()->Clear();
    for (const auto& m : memory_execs_) {
      exec_.add_memory_execs()->MergeFrom(m);
    }
    return exec_;
  }

  void FromProto(const ExecProfile& exec) {
    exec_.Clear();
    exec_.MergeFrom(exec);

    devices_.clear();
    devices_.insert(exec.devices().begin(), exec.devices().end());

    accelerator_execs_.clear();
    cpu_execs_.clear();
    op_execs_.clear();

    allocations_.clear();
    memory_execs_.clear();

    for (const auto& exec_time : exec_.accelerator_execs()) {
      auto& exec = accelerator_execs_[exec_time.first];
      auto& op_exec = op_execs_[exec_time.first];
      for (const auto& p : exec_time.second.times()) {
        exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
        op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
      }
    }
    for (const auto& exec_time : exec_.cpu_execs()) {
      auto& exec = cpu_execs_[exec_time.first];
      auto& op_exec = op_execs_[exec_time.first];
      for (const auto& p : exec_time.second.times()) {
        exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
        op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
      }
    }
    for (const auto& r : exec_.allocations()) {
      allocations_.push_back(r);
    }
    for (const auto& m : exec_.memory_execs()) {
      memory_execs_.push_back(m);
    }
  }

 private:
  ExecProfile exec_;
  // device -> vector of {op_start_micros, op_exec_micros} pairs.
  // accelerator_execs: gpu:id/stream:all -> {op_start_micros, op_exec_micros}
  // For an accelerator, the vector can have more than one entry: a kernel may
  // fire multiple times, e.g. inside tf.while_loop.
  std::map<string, std::vector<std::pair<int64, int64>>> accelerator_execs_;
  // cpu_execs: cpu/gpu:id -> {op_start_micros, op_exec_micros}
  // For cpu, the vector can have more than one entry if inside tf.while_loop.
  std::map<string, std::vector<std::pair<int64, int64>>> cpu_execs_;
  // Combines accelerator_execs_ and cpu_execs_.
  std::map<string, std::vector<std::pair<int64, int64>>> op_execs_;
  // Each ExecMemory corresponds to one scheduling of the op. Normally,
  // there are multiple schedulings in while_loop.
  std::vector<ExecMemory> memory_execs_;
  // All devices the op is associated with (e.g. gpu:0 (scheduling),
  // gpu:0:stream:xx (kernel exec), cpu:0 host).
  std::set<string> devices_;

  // The history of accelerator allocations and deallocations of this step.
  std::vector<AllocationRecord> allocations_;
};

#define GRAPH_NODE_BYTES(type)              \
  do {                                      \
    if (execs_.empty()) {                   \
      return 0;                             \
    }                                       \
    if (step >= 0) {                        \
      auto exec = execs_.find(step);        \
      if (exec == execs_.end()) return 0;   \
      return exec->second.type##_bytes();   \
    }                                       \
                                            \
    int64 bytes = 0;                        \
    for (const auto& exec : execs_) {       \
      bytes += exec.second.type##_bytes();  \
    }                                       \
    return bytes / execs_.size();           \
  } while (0)
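
// For illustration, GRAPH_NODE_BYTES(requested) expands (modulo the
// do { } while (0) wrapper) inside a method that takes an `int64 step`
// parameter and has access to `execs_` to: the requested bytes of one step
// when step >= 0, otherwise the average across all recorded steps:
//
//   if (execs_.empty()) return 0;
//   if (step >= 0) {
//     auto exec = execs_.find(step);
//     if (exec == execs_.end()) return 0;
//     return exec->second.requested_bytes();
//   }
//   int64 bytes = 0;
//   for (const auto& exec : execs_) {
//     bytes += exec.second.requested_bytes();
//   }
//   return bytes / execs_.size();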

class TFGraphNode {
 public:
  TFGraphNode(const ProfileNode& node, const ProfileProto& profile,
              const std::map<int64, string>* id_to_string,
              const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) {
    nodes_map_ = nodes_map;
    FromProto(node, profile, id_to_string);
  }

  TFGraphNode(const NodeDef* node, int64 id,
              const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) {
    nodes_map_ = nodes_map;
    node_.set_id(id);
    node_.set_name(node->name());
    node_.set_op(node->op());
    node_.set_float_ops(0);

    for (const auto& attr : node->attr()) {
      (*node_.mutable_attrs())[attr.first].MergeFrom(attr.second);
      if (attr.first == "shape" && attr.second.has_shape()) {
        if (!shape_.empty()) {
          fprintf(stderr, "Found duplicated shapes!\n");
          continue;
        }
        shape_ = ShapeProtoToVec(attr.second.shape());
      } else if (attr.first == "_output_shapes" && attr.second.has_list()) {
        if (!output_shapes_.empty()) {
          fprintf(stderr, "Found duplicated output shapes!\n");
          continue;
        }
        for (int i = 0; i < attr.second.list().shape_size(); ++i) {
          output_shapes_[i] = ShapeProtoToVec(attr.second.list().shape(i));
        }
      }
    }
    op_types_.insert(node->op());
  }

  void AddInput(const string& input, int64 output_index, int input_idx) {
    inputs_[input_idx] = input;
    src_output_idx_[input] = output_index;
  }

  void AddOpType(const string& op_type) { op_types_.insert(op_type); }

  void AddStepStat(int64 step, const string& device,
                   const NodeExecStats& step_stat);

  void AddFloatOps(int64 float_ops) { node_.set_float_ops(float_ops); }

  // TODO(xpan): This could take a lot of memory.
  void AddCode(const CodeDef& code,
               const std::map<int64, string>* id_to_string) {
    if (!call_stack_) {
      call_stack_.reset(new CallStack(code, id_to_string));
    }
  }

  const string& name() const { return node_.name(); }
  int64 id() const { return node_.id(); }
  const string& op() const { return node_.op(); }
  const ProfileNode& node() { return node_; }

  bool trackable(int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) return false;

    if (exec->second.all_start_micros() == 0) return false;
    if (node_.canonical_device().empty() || node_.host_device().empty()) {
      return false;
    }
    return true;
  }

  const ProfileNode& ToProto(
      const std::map<string, std::unique_ptr<TFGraphNode>>& nodes_map) {
    node_.clear_shape();
    node_.mutable_shape()->Reserve(shape().size());
    for (int64 s : shape()) {
      node_.add_shape(s);
    }

    node_.clear_op_types();
    node_.mutable_op_types()->Reserve(op_types().size());
    for (const string& t : op_types()) {
      node_.add_op_types(t);
    }

    node_.clear_execs();
    for (auto& exec : execs_) {
      auto& exec_pb = (*node_.mutable_execs())[exec.first];
      exec_pb.MergeFrom(exec.second.ToProto());
    }

    node_.clear_inputs();
    for (const auto& inp : inputs_) {
      (*node_.mutable_inputs())[inp.first] = nodes_map.at(inp.second)->id();
    }

    node_.clear_input_shapes();
    for (const auto& s : input_shapes_) {
      auto& shape = (*node_.mutable_input_shapes())[s.first];
      for (int64 d : s.second) {
        shape.add_int64_values(d);
      }
    }

    node_.clear_output_shapes();
    for (const auto& s : output_shapes_) {
      auto& shape = (*node_.mutable_output_shapes())[s.first];
      for (int64 d : s.second) {
        shape.add_int64_values(d);
      }
    }

    node_.clear_src_output_index();
    for (const auto& s : src_output_idx_) {
      int64 id = nodes_map.at(s.first)->id();
      (*node_.mutable_src_output_index())[id] = s.second;
    }

    if (call_stack_) {
      node_.clear_trace();
      node_.mutable_trace()->MergeFrom(call_stack_->code_def());
    }
    return node_;
  }

  void FromProto(const ProfileNode& node, const ProfileProto& profile,
                 const std::map<int64, string>* id_to_string) {
    node_.Clear();
    node_.MergeFrom(node);

    call_stack_.reset(new CallStack(node.trace(), id_to_string));

    op_types_.clear();
    op_types_.insert(node_.op_types().begin(), node_.op_types().end());

    shape_.clear();
    for (int64 s : node_.shape()) {
      shape_.push_back(s);
    }

    execs_.clear();
    for (const auto& exec_pb : node.execs()) {
      auto& exec = execs_[exec_pb.first];
      exec.FromProto(exec_pb.second);
    }

    inputs_.clear();
    for (const auto& inp : node.inputs()) {
      inputs_[inp.first] = profile.nodes().at(inp.second).name();
    }

    input_shapes_.clear();
    for (const auto& s : node.input_shapes()) {
      auto& shape = input_shapes_[s.first];
      for (const int64 d : s.second.int64_values()) {
        shape.push_back(d);
      }
    }

    output_shapes_.clear();
    for (const auto& s : node.output_shapes()) {
      auto& shape = output_shapes_[s.first];
      for (const int64 d : s.second.int64_values()) {
        shape.push_back(d);
      }
    }

    src_output_idx_.clear();
    for (const auto& s : node.src_output_index()) {
      src_output_idx_[profile.nodes().at(s.first).name()] = s.second;
    }
  }

  const std::map<int32, string>& inputs() const { return inputs_; }

  // Number of times the graph node is executed. When step < 0, the
  // average number of times executed across all steps.
  int64 run_count(int64 step) const {
    if (execs_.empty()) {
      return 0;
    }
    if (step >= 0) {
      auto exec = execs_.find(step);
      if (exec == execs_.end()) {
        return 0;
      }
      return exec->second.run_count();
    }
    int64 total_run_count = 0;
    for (const auto& exec : execs_) {
      total_run_count += exec.second.run_count();
    }
    return total_run_count / execs_.size();
  }

  // This is the overall computation time, including both cpu and accelerator.
  // Note: cpu and accelerator may or may not run in parallel.
  int64 exec_micros(int64 step) const {
    // Empty when no RunMetadata is provided.
    if (execs_.empty()) {
      return 0;
    }
    if (step >= 0) {
      auto exec = execs_.find(step);
      if (exec == execs_.end()) {
        return 0;
      }
      return exec->second.exec_micros();
    }

    int64 total_micros = 0;
    for (const auto& exec : execs_) {
      total_micros += exec.second.exec_micros();
    }
    return total_micros / execs_.size();
  }

  // This is the accelerator computation time of a step, or the average over
  // multiple steps when step < 0.
  int64 accelerator_exec_micros(int64 step) const {
    // Empty when no RunMetadata is provided.
    if (execs_.empty()) {
      return 0;
    }
    if (step >= 0) {
      auto exec = execs_.find(step);
      if (exec == execs_.end()) {
        return 0;
      }
      return exec->second.accelerator_exec_micros();
    }

    int64 total_micros = 0;
    for (const auto& exec : execs_) {
      total_micros += exec.second.accelerator_exec_micros();
    }
    return total_micros / execs_.size();
  }
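
  // A minimal querying sketch (illustrative; `node` is assumed to be a
  // populated TFGraphNode, and step 1 a recorded step):
  //
  //   int64 t1 = node.exec_micros(1);    // time for step 1 only
  //   int64 avg = node.exec_micros(-1);  // average across recorded steps
  //
  // Per ExecStep::exec_micros(), the value is accelerator time when the op
  // ran on an accelerator, and cpu time otherwise.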

  // This is the cpu computation time of a step, or the average over
  // multiple steps when step < 0.
  int64 cpu_exec_micros(int64 step) const {
    // Empty when no RunMetadata is provided.
    if (execs_.empty()) {
      return 0;
    }
    if (step >= 0) {
      auto exec = execs_.find(step);
      if (exec == execs_.end()) {
        return 0;
      }
      return exec->second.cpu_exec_micros();
    }

    int64 total_micros = 0;
    for (const auto& exec : execs_) {
      total_micros += exec.second.cpu_exec_micros();
    }
    return total_micros / execs_.size();
  }

  int64 requested_bytes(int64 step) const { GRAPH_NODE_BYTES(requested); }
  int64 peak_bytes(int64 step) const { GRAPH_NODE_BYTES(peak); }
  int64 residual_bytes(int64 step) const { GRAPH_NODE_BYTES(residual); }
  int64 output_bytes(int64 step) const { GRAPH_NODE_BYTES(output); }

  int64 all_start_micros(int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return 0;
    }
    return exec->second.all_start_micros();
  }

  int64 latest_end_micros(int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return 0;
    }
    return exec->second.latest_end_micros();
  }

  int64 lastest_schedule_end_micros(int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return 0;
    }
    return exec->second.lastest_schedule_end_micros();
  }

  const std::map<string, std::vector<std::pair<int64, int64>>>& op_execs(
      int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return empty_execs_;
    }
    return exec->second.op_execs();
  }
  const std::map<string, std::vector<std::pair<int64, int64>>>& cpu_execs(
      int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return empty_execs_;
    }
    return exec->second.cpu_execs();
  }

  const std::map<int64, ExecStep>& all_op_execs() const { return execs_; }

  int64 accelerator_temp_bytes(int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return 0;
    }
    return exec->second.accelerator_temp_bytes();
  }
  int64 host_temp_bytes(int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return 0;
    }
    return exec->second.host_temp_bytes();
  }
  int64 accelerator_persistent_bytes() const {
    int64 persistent_bytes = 0;
    for (const auto& exec : execs_) {
      persistent_bytes = std::max(persistent_bytes,
                                  exec.second.accelerator_persistent_bytes());
    }
    return persistent_bytes;
  }
  const std::map<int64, int64> allocator_bytes_in_use(int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return empty_bytes_in_use_;
    }
    return exec->second.allocator_bytes_in_use();
  }

  const std::vector<AllocationRecord>& allocations(int64 step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return empty_allocations_;
    }
    return exec->second.allocations();
  }
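
  // Timing entries are {op_start_micros, op_exec_micros} pairs per device.
  // A small worked example (illustrative values): if cpu_execs(step) returns
  // {"cpu:0" -> {{100, 20}, {500, 30}}}, the op was scheduled twice in that
  // step, and lastest_schedule_end_micros(step) is
  // max(100 + 20, 500 + 30) = 530.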

  int64 parameters() const {
    if (!shape().empty()) {
      int64 params = 1;
      bool complete_shape = true;
      for (int64 d : shape()) {
        // Sometimes a dimension could be <0 when the dim is unknown.
        if (d < 0) {
          complete_shape = false;
          break;
        }
        params *= d;
      }
      if (complete_shape) {
        return params;
      } else {
        fprintf(stderr, "Incomplete shape.\n");
      }
    }
    return 0;
  }

  int64 float_ops(int64 step) const {
    // If not run, return the result of static analysis.
    if (execs_.empty()) {
      return node_.float_ops();
    }
    // Otherwise, return the dynamic float_ops.
    return node_.float_ops() * run_count(step);
  }

  const CallStack* call_stack() { return call_stack_.get(); }
  string canonical_device() const { return node_.canonical_device(); }
  string host_device() const { return node_.host_device(); }
  const std::set<string>& op_types() const { return op_types_; }

  const AttrValue* op_attrs(const string& name) const {
    const auto it = node_.attrs().find(name);
    if (it == node_.attrs().end()) {
      return nullptr;
    }
    return &it->second;
  }

  const std::vector<int64>& shape() const { return shape_; }

  const std::map<int, std::vector<int64>>& output_shapes() const {
    return output_shapes_;
  }

  const std::map<int, std::vector<int64>> input_shapes() const {
    std::map<int, std::vector<int64>> input_shapes;
    for (const auto& inp : inputs_) {
      // Always create an entry, even if the shape info might be missing.
      std::vector<int64>& shape_vec = input_shapes[inp.first];
      if (!nodes_map_) continue;
      auto input_it = nodes_map_->find(inp.second);
      if (input_it == nodes_map_->end()) continue;
      auto output_it = src_output_idx_.find(inp.second);
      if (output_it == src_output_idx_.end()) continue;

      const TFGraphNode* input_node = input_it->second.get();
      if (!input_node) continue;
      // Look up the shape of the source node's output that feeds this input.
      const auto& output_shapes = input_node->output_shapes();
      const auto output_shape = output_shapes.find(output_it->second);
      if (output_shape == output_shapes.end()) continue;

      shape_vec.assign(output_shape->second.begin(),
                       output_shape->second.end());
    }
    return input_shapes;
  }
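
  // A small worked example (illustrative): for a Variable of shape
  // [64, 128], parameters() returns 64 * 128 = 8192. If any dimension is
  // unknown (e.g. [-1, 128]), the shape is incomplete and parameters()
  // returns 0. Similarly, float_ops(step) scales the statically analyzed
  // per-run flop count by run_count(step), so an op with 1000 float_ops
  // that ran twice in a step reports 2000.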

 private:
  // Maps graph node name to TFGraphNode. Not owned.
  const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map_;
  // Inputs to the node: input index -> input node name.
  std::map<int, string> inputs_;
  // The output index of the source node, keyed by source node name.
  std::map<string, int32> src_output_idx_;
  // Proto for the serialized/deserialized representation of the node.
  ProfileNode node_;
  // Python call stack that created the node.
  std::unique_ptr<CallStack> call_stack_;
  // Shape of the node (e.g. Variable) if available.
  std::vector<int64> shape_;
  // No input_idx is ever missing, but some shapes might be empty (unknown).
  std::map<int, std::vector<int64>> input_shapes_;
  // An output_idx might be missing if there is no _output_shapes attr; some
  // shapes can also be empty.
  std::map<int, std::vector<int64>> output_shapes_;

  std::set<string> op_types_;

  std::map<int64, ExecStep> execs_;

  // Placeholders for empty cases.
  std::map<int64, int64> empty_bytes_in_use_;
  std::map<string, std::vector<std::pair<int64, int64>>> empty_execs_;
  std::vector<AllocationRecord> empty_allocations_;
};

class TFMultiGraphNode {
 public:
  TFMultiGraphNode(const string& name)
      : name_(name),
        step_(-1),
        run_count_(0),
        exec_micros_(0),
        accelerator_exec_micros_(0),
        cpu_exec_micros_(0),
        requested_bytes_(0),
        peak_bytes_(0),
        residual_bytes_(0),
        output_bytes_(0),
        float_ops_(0),
        parameters_(0) {}

  bool SnapshotNodes(int64 step, const std::vector<string>& type_regexes) {
    run_count_ = 0;
    exec_micros_ = 0;
    accelerator_exec_micros_ = 0;
    cpu_exec_micros_ = 0;

    requested_bytes_ = 0;
    peak_bytes_ = 0;
    residual_bytes_ = 0;
    output_bytes_ = 0;

    float_ops_ = 0;
    parameters_ = 0;
    op_types_.clear();
    shapes_.clear();
    devices_.clear();
    snapshot_nodes_.clear();

    step_ = step;
    std::vector<const TFGraphNode*> nodes = pick_nodes(type_regexes);

    if (nodes.empty()) {
      return (type_regexes.size() == 1 && type_regexes[0] == ".*");
    }

    for (const TFGraphNode* node : nodes) {
      op_types_.insert(node->op_types().begin(), node->op_types().end());

      run_count_ += node->run_count(step);
      exec_micros_ += node->exec_micros(step);
      accelerator_exec_micros_ += node->accelerator_exec_micros(step);
      cpu_exec_micros_ += node->cpu_exec_micros(step);

      requested_bytes_ += node->requested_bytes(step);
      peak_bytes_ += node->peak_bytes(step);
      residual_bytes_ += node->residual_bytes(step);
      output_bytes_ += node->output_bytes(step);

      float_ops_ += node->float_ops(step);
      parameters_ += node->parameters();
      if (!node->shape().empty()) {
        shapes_.push_back(node->shape());
      }
      devices_.insert(node->canonical_device());
      snapshot_nodes_[node->name()] = node;
    }
    return true;
  }

  int64 step() const { return step_; }

  void AddGraphNode(const TFGraphNode* node) {
    if (nodes_.find(node->name()) != nodes_.end()) {
      return;
    }
    nodes_[node->name()] = node;
  }

  const std::map<string, const TFGraphNode*>& graph_nodes() const {
    return snapshot_nodes_;
  }

  const string& name() const { return name_; }

  int64 run_count() const { return run_count_; }
  int64 exec_micros() const { return exec_micros_; }
  int64 accelerator_exec_micros() const { return accelerator_exec_micros_; }
  int64 cpu_exec_micros() const { return cpu_exec_micros_; }

  int64 requested_bytes() const { return requested_bytes_; }
  int64 peak_bytes() const { return peak_bytes_; }
  int64 residual_bytes() const { return residual_bytes_; }
  int64 output_bytes() const { return output_bytes_; }

  int64 float_ops() const { return float_ops_; }

  int64 parameters() const { return parameters_; }

  const std::set<string>& devices() const { return devices_; }

  const std::set<string>& op_types() const { return op_types_; }

  const std::vector<std::vector<int64>>& shapes() const { return shapes_; }

 private:
  std::vector<const TFGraphNode*> pick_nodes(
      const std::vector<string>& type_regexes) {
    if (type_regexes.empty()) {
      return {};
    }
    std::vector<const TFGraphNode*> ret;
    if (type_regexes.size() == 1 && type_regexes[0] == ".*") {
      for (const auto& n : nodes_) {
        ret.push_back(n.second);
      }
      return ret;
    }

    for (const string& regex : type_regexes) {
      for (const auto& n : nodes_) {
        for (const string& type : n.second->op_types()) {
          if (RE2::FullMatch(type, regex)) {
            ret.push_back(n.second);
            break;
          }
        }
      }
    }
    return ret;
  }

  const string name_;
  int64 step_;
  // Snapshot based on type_regexes.
  std::set<string> op_types_;
  int64 run_count_;
  int64 exec_micros_;
  int64 accelerator_exec_micros_;
  int64 cpu_exec_micros_;

  int64 requested_bytes_;
  int64 peak_bytes_;
  int64 residual_bytes_;
  int64 output_bytes_;
  int64 float_ops_;
  int64 parameters_;
  std::set<string> devices_;
  std::vector<std::vector<int64>> shapes_;
  std::map<string, const TFGraphNode*> snapshot_nodes_;

  // Overall data held by the TFMultiGraphNode.
  std::map<string, const TFGraphNode*> nodes_;
};
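
// A minimal usage sketch (illustrative; `conv1`/`conv2` are assumed to be
// const TFGraphNode* that outlive the group, which does not own them):
//
//   TFMultiGraphNode conv_group("convolutions");
//   conv_group.AddGraphNode(conv1);
//   conv_group.AddGraphNode(conv2);
//   // Aggregate step 0 stats over nodes whose op type matches "Conv2D.*".
//   if (conv_group.SnapshotNodes(0, {"Conv2D.*"})) {
//     int64 micros = conv_group.exec_micros();
//   }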

bool IsPlacedOnCPU(const string& device);
bool IsPlacedOnAccelerator(const string& device);
bool CountAsAcceleratorTime(const string& device);
bool CountAsCPUTime(const string& device);
bool IsCanonicalDevice(const string& device);

}  // namespace tfprof
}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_