/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_

#include <map>
#include <set>
#include <string>
#include <vector>

#include "absl/strings/str_format.h"
#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor_description.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/regexp.h"
#include "tensorflow/core/profiler/tfprof_log.pb.h"
#include "tensorflow/core/profiler/tfprof_options.h"

namespace tensorflow {
namespace tfprof {
std::vector<int64_t> ShapeProtoToVec(const TensorShapeProto& shape_pb);

TensorShapeProto VecToShapeProto(const std::vector<int64_t>& shape_vec);

class TFGraphNode;

class CallStack {
 public:
  class Trace {
   public:
    Trace(const CodeDef::Trace* trace,
          const std::map<int64_t, string>* id_to_string)
        : trace_(trace), id_to_string_(id_to_string) {}

    int32 lineno() const { return trace_->lineno(); }
    string file() const {
      // Backward compatible with old proto files.
      if (!trace_->file().empty()) return trace_->file();
      return id_to_string_->at(trace_->file_id());
    }
    string function() const {
      // Backward compatible with old proto files.
      if (!trace_->function().empty()) return trace_->function();
      return id_to_string_->at(trace_->function_id());
    }
    int32 func_start_line() const { return trace_->func_start_line(); }

   private:
    const CodeDef::Trace* trace_;
    const std::map<int64_t, string>* id_to_string_;
  };

  CallStack(const CodeDef& def, const std::map<int64_t, string>* id_to_string)
      : def_(def) {
    traces_.reserve(def.traces_size());
    for (const auto& t : def_.traces()) {
      traces_.emplace_back(&t, id_to_string);
    }
  }

  const CodeDef& code_def() const { return def_; }
  const std::vector<Trace>& traces() const { return traces_; }

 private:
  std::vector<Trace> traces_;
  CodeDef def_;
};
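
// Illustrative usage sketch (not part of this header's API): walking a call
// stack. `code_def` and `id_to_string` are assumed to be already populated.
//
//   CallStack stack(code_def, &id_to_string);
//   for (const CallStack::Trace& t : stack.traces()) {
//     // file()/function() transparently fall back to the string table for
//     // protos that store ids instead of inline strings.
//     absl::FPrintF(stderr, "%s:%d %s\n", t.file(), t.lineno(), t.function());
//   }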

class ExecStep {
 public:
  ExecStep() {}

  void AddTimeStats(const string& dev, const NodeExecStats& step_stat);

  void AddMemoryStats(const string& dev, const NodeExecStats& step_stat);

  int64_t run_count() const { return exec_.run_count(); }
  // The execution time of an op. If it runs on an accelerator, this is
  // accelerator_exec_micros(). Otherwise, it's CPU time.
  int64_t exec_micros() const;
  // The accelerator execution time of an op. 0 if not run on an accelerator.
  int64_t accelerator_exec_micros() const;
  // The CPU execution time of an op.
  int64_t cpu_exec_micros() const;

  const std::map<string, std::vector<std::pair<int64_t, int64_t>>>& op_execs()
      const {
    return op_execs_;
  }
  const std::map<string, std::vector<std::pair<int64_t, int64_t>>>& cpu_execs()
      const {
    return cpu_execs_;
  }
  int64_t all_start_micros() const { return exec_.all_start_micros(); }
  int64_t latest_end_micros() const { return exec_.latest_end_micros(); }
  int64_t lastest_schedule_end_micros() const {
    int64_t ret = 0;
    for (const auto& exec : cpu_execs_) {
      for (const auto& pair : exec.second) {
        ret = std::max(ret, pair.first + pair.second);
      }
    }
    return ret;
  }
  int64_t requested_bytes() const {
    int64_t requested_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      requested_bytes += exec.requested_bytes();
    }
    return requested_bytes;
  }
  int64_t peak_bytes() const {
    int64_t peak_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      peak_bytes += exec.peak_bytes();
    }
    return peak_bytes;
  }
  int64_t residual_bytes() const {
    int64_t residual_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      residual_bytes += exec.residual_bytes();
    }
    return residual_bytes;
  }
  int64_t output_bytes() const {
    int64_t output_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      output_bytes += exec.output_bytes();
    }
    return output_bytes;
  }
  int64_t accelerator_temp_bytes() const {
    int64_t accelerator_temp_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      accelerator_temp_bytes += exec.accelerator_temp_bytes();
    }
    return accelerator_temp_bytes;
  }
  int64_t host_temp_bytes() const {
    int64_t host_temp_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      host_temp_bytes += exec.host_temp_bytes();
    }
    return host_temp_bytes;
  }
  int64_t accelerator_persistent_bytes() const {
    int64_t accelerator_persistent_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      accelerator_persistent_bytes += exec.accelerator_persistent_bytes();
    }
    return accelerator_persistent_bytes;
  }
  int64_t host_persistent_bytes() const {
    int64_t host_persistent_bytes = 0;
    for (const ExecMemory& exec : memory_execs_) {
      host_persistent_bytes += exec.host_persistent_bytes();
    }
    return host_persistent_bytes;
  }
  std::map<int64_t, int64_t> allocator_bytes_in_use() const {
    std::map<int64_t, int64_t> bytes_in_use;
    for (const ExecMemory& exec : memory_execs_) {
      bytes_in_use[exec.memory_micros()] = exec.allocator_bytes_in_use();
    }
    return bytes_in_use;
  }

  const std::vector<AllocationRecord>& allocations() const {
    return allocations_;
  }

  const ExecProfile& ToProto() {
    exec_.mutable_accelerator_execs()->clear();
    for (const auto& e : accelerator_execs_) {
      auto& exec_time = (*exec_.mutable_accelerator_execs())[e.first];
      for (const auto& p : e.second) {
        auto* t = exec_time.mutable_times()->Add();
        t->add_int64_values(p.first);
        t->add_int64_values(p.second);
      }
    }

    exec_.mutable_cpu_execs()->clear();
    for (const auto& e : cpu_execs_) {
      auto& exec_time = (*exec_.mutable_cpu_execs())[e.first];
      for (const auto& p : e.second) {
        auto* t = exec_time.mutable_times()->Add();
        t->add_int64_values(p.first);
        t->add_int64_values(p.second);
      }
    }

    exec_.mutable_devices()->Clear();
    exec_.mutable_devices()->Reserve(devices_.size());
    for (const string& d : devices_) {
      exec_.add_devices(d);
    }

    exec_.mutable_allocations()->Clear();
    for (const auto& r : allocations_) {
      exec_.add_allocations()->MergeFrom(r);
    }

    exec_.mutable_memory_execs()->Clear();
    for (const auto& m : memory_execs_) {
      exec_.add_memory_execs()->MergeFrom(m);
    }
    return exec_;
  }

  void FromProto(const ExecProfile& exec) {
    exec_.Clear();
    exec_.MergeFrom(exec);

    devices_.clear();
    devices_.insert(exec.devices().begin(), exec.devices().end());

    accelerator_execs_.clear();
    cpu_execs_.clear();
    op_execs_.clear();

    allocations_.clear();
    memory_execs_.clear();

    for (const auto& exec_time : exec_.accelerator_execs()) {
      auto& exec = accelerator_execs_[exec_time.first];
      auto& op_exec = op_execs_[exec_time.first];
      for (const auto& p : exec_time.second.times()) {
        exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
        op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
      }
    }
    for (const auto& exec_time : exec_.cpu_execs()) {
      auto& exec = cpu_execs_[exec_time.first];
      auto& op_exec = op_execs_[exec_time.first];
      for (const auto& p : exec_time.second.times()) {
        exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
        op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
      }
    }
    for (const auto& r : exec_.allocations()) {
      allocations_.push_back(r);
    }
    for (const auto& m : exec_.memory_execs()) {
      memory_execs_.push_back(m);
    }
  }

 private:
  ExecProfile exec_;
  // device -> vector of {op_start_micros, op_exec_micros} pairs.
  // accelerator_execs: gpu:id/stream:all -> {op_start_micros, op_exec_micros}
  // For accelerators, the vector can have more than one entry: an op may
  // launch multiple kernels, or run multiple times inside tf.while_loop.
  std::map<string, std::vector<std::pair<int64_t, int64_t>>> accelerator_execs_;
  // cpu_execs: cpu/gpu:id -> {op_start_micros, op_exec_micros}
  // For CPU, the vector can have more than one entry if the op runs inside
  // tf.while_loop.
  std::map<string, std::vector<std::pair<int64_t, int64_t>>> cpu_execs_;
  // Combines accelerator_execs_ and cpu_execs_.
  std::map<string, std::vector<std::pair<int64_t, int64_t>>> op_execs_;
  // Each ExecMemory corresponds to one scheduling of the op. Normally,
  // there are multiple schedulings in tf.while_loop.
  std::vector<ExecMemory> memory_execs_;
  // All devices the op is associated with (e.g. gpu:0 (scheduling),
  // gpu:0:stream:xx (kernel exec), cpu:0 host).
  std::set<string> devices_;

  // The history of accelerator allocations and deallocations of this step.
  std::vector<AllocationRecord> allocations_;
};
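
// Illustrative round-trip sketch (assumes an existing ExecProfile `pb`):
// ExecStep keeps timing data in map/vector form for fast aggregation and
// converts to/from the ExecProfile proto only for serialization.
//
//   ExecStep step;
//   step.FromProto(pb);                         // proto -> in-memory maps
//   const ExecProfile& again = step.ToProto();  // in-memory maps -> proto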

#define GRAPH_NODE_BYTES(type)              \
  do {                                      \
    if (execs_.empty()) {                   \
      return 0;                             \
    }                                       \
    if (step >= 0) {                        \
      auto exec = execs_.find(step);        \
      if (exec == execs_.end()) return 0;   \
      return exec->second.type##_bytes();   \
    }                                       \
                                            \
    int64_t bytes = 0;                      \
    for (const auto& exec : execs_) {       \
      bytes += exec.second.type##_bytes();  \
    }                                       \
    return bytes / execs_.size();           \
  } while (0)
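
// For reference, GRAPH_NODE_BYTES(requested) expands (roughly) to: return the
// requested bytes of one step when step >= 0, otherwise the average requested
// bytes across all recorded steps:
//
//   if (execs_.empty()) return 0;
//   if (step >= 0) { ... return exec->second.requested_bytes(); }
//   int64_t bytes = 0;
//   for (const auto& exec : execs_) bytes += exec.second.requested_bytes();
//   return bytes / execs_.size();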

class TFGraphNode {
 public:
  TFGraphNode(const ProfileNode& node, const ProfileProto& profile,
              const std::map<int64_t, string>* id_to_string,
              const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) {
    nodes_map_ = nodes_map;
    FromProto(node, profile, id_to_string);
  }

  TFGraphNode(const NodeDef* node, int64_t id,
              const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) {
    nodes_map_ = nodes_map;
    node_.set_id(id);
    node_.set_name(node->name());
    node_.set_op(node->op());
    node_.set_float_ops(0);

    for (const auto& attr : node->attr()) {
      (*node_.mutable_attrs())[attr.first].MergeFrom(attr.second);
      if (attr.first == "shape" && attr.second.has_shape()) {
        if (!shape_.empty()) {
          absl::FPrintF(stderr, "Found duplicated shapes!\n");
          continue;
        }
        shape_ = ShapeProtoToVec(attr.second.shape());
      } else if (attr.first == "_output_shapes" && attr.second.has_list()) {
        if (!output_shapes_.empty()) {
          absl::FPrintF(stderr, "Found duplicated output shapes!\n");
          continue;
        }
        for (int i = 0; i < attr.second.list().shape_size(); ++i) {
          output_shapes_[i] = ShapeProtoToVec(attr.second.list().shape(i));
        }
      }
    }
    op_types_.insert(node->op());
  }

  void AddInput(const string& input, int64_t output_index, int input_idx) {
    inputs_[input_idx] = input;
    src_output_idx_[input] = output_index;
  }

  void AddOpType(const string& op_type) { op_types_.insert(op_type); }

  void AddStepStat(int64_t step, const string& device,
                   const NodeExecStats& step_stat);

  void AddFloatOps(int64_t float_ops) { node_.set_float_ops(float_ops); }

  // TODO(xpan): This could take a lot of memory.
  void AddCode(const CodeDef& code,
               const std::map<int64_t, string>* id_to_string) {
    if (!call_stack_) {
      call_stack_.reset(new CallStack(code, id_to_string));
    }
  }

  const string& name() const { return node_.name(); }
  int64_t id() const { return node_.id(); }
  const string& op() const { return node_.op(); }
  const ProfileNode& node() { return node_; }

  bool trackable(int64_t step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) return false;

    if (exec->second.all_start_micros() == 0) return false;
    if (node_.canonical_device().empty() || node_.host_device().empty()) {
      return false;
    }
    return true;
  }

  const ProfileNode& ToProto(
      const std::map<string, std::unique_ptr<TFGraphNode>>& nodes_map) {
    node_.clear_shape();
    node_.mutable_shape()->Reserve(shape().size());
    for (int64_t s : shape()) {
      node_.add_shape(s);
    }

    node_.clear_op_types();
    node_.mutable_op_types()->Reserve(op_types().size());
    for (const string& t : op_types()) {
      node_.add_op_types(t);
    }

    node_.clear_execs();
    for (auto& exec : execs_) {
      auto& exec_pb = (*node_.mutable_execs())[exec.first];
      exec_pb.MergeFrom(exec.second.ToProto());
    }

    node_.clear_inputs();
    for (const auto& inp : inputs_) {
      (*node_.mutable_inputs())[inp.first] = nodes_map.at(inp.second)->id();
    }

    node_.clear_input_shapes();
    for (const auto& s : input_shapes_) {
      auto& shape = (*node_.mutable_input_shapes())[s.first];
      for (int64_t d : s.second) {
        shape.add_int64_values(d);
      }
    }

    node_.clear_output_shapes();
    for (const auto& s : output_shapes_) {
      auto& shape = (*node_.mutable_output_shapes())[s.first];
      for (int64_t d : s.second) {
        shape.add_int64_values(d);
      }
    }

    node_.clear_src_output_index();
    for (const auto& s : src_output_idx_) {
      int64_t id = nodes_map.at(s.first)->id();
      (*node_.mutable_src_output_index())[id] = s.second;
    }

    if (call_stack_) {
      node_.clear_trace();
      node_.mutable_trace()->MergeFrom(call_stack_->code_def());
    }
    return node_;
  }

  void FromProto(const ProfileNode& node, const ProfileProto& profile,
                 const std::map<int64_t, string>* id_to_string) {
    node_.Clear();
    node_.MergeFrom(node);

    call_stack_.reset(new CallStack(node.trace(), id_to_string));

    op_types_.clear();
    op_types_.insert(node_.op_types().begin(), node_.op_types().end());

    shape_.clear();
    for (int64_t s : node_.shape()) {
      shape_.push_back(s);
    }

    execs_.clear();
    for (const auto& exec_pb : node.execs()) {
      auto& exec = execs_[exec_pb.first];
      exec.FromProto(exec_pb.second);
    }

    inputs_.clear();
    for (const auto& inp : node.inputs()) {
      inputs_[inp.first] = profile.nodes().at(inp.second).name();
    }

    input_shapes_.clear();
    for (const auto& s : node.input_shapes()) {
      auto& shape = input_shapes_[s.first];
      for (const int64_t d : s.second.int64_values()) {
        shape.push_back(d);
      }
    }

    output_shapes_.clear();
    for (const auto& s : node.output_shapes()) {
      auto& shape = output_shapes_[s.first];
      for (const int64_t d : s.second.int64_values()) {
        shape.push_back(d);
      }
    }

    src_output_idx_.clear();
    for (const auto& s : node.src_output_index()) {
      src_output_idx_[profile.nodes().at(s.first).name()] = s.second;
    }
  }

  const std::map<int32, string>& inputs() const { return inputs_; }

  // Number of times the graph node is executed. When step < 0, returns
  // the average number of executions across all steps.
  int64_t run_count(int64_t step) const {
    if (execs_.empty()) {
      return 0;
    }
    if (step >= 0) {
      auto exec = execs_.find(step);
      if (exec == execs_.end()) {
        return 0;
      }
      return exec->second.run_count();
    }
    int64_t total_run_count = 0;
    for (const auto& exec : execs_) {
      total_run_count += exec.second.run_count();
    }
    return total_run_count / execs_.size();
  }

  // Overall computation time, including both CPU and accelerator.
  // Note: CPU and accelerator may or may not run in parallel.
  int64_t exec_micros(int64_t step) const {
    // Empty when no RunMetadata is provided.
    if (execs_.empty()) {
      return 0;
    }
    if (step >= 0) {
      auto exec = execs_.find(step);
      if (exec == execs_.end()) {
        return 0;
      }
      return exec->second.exec_micros();
    }

    int64_t total_micros = 0;
    for (const auto& exec : execs_) {
      total_micros += exec.second.exec_micros();
    }
    return total_micros / execs_.size();
  }

  // Accelerator computation time of a step, or the average across
  // multiple steps when step < 0.
  int64_t accelerator_exec_micros(int64_t step) const {
    // Empty when no RunMetadata is provided.
    if (execs_.empty()) {
      return 0;
    }
    if (step >= 0) {
      auto exec = execs_.find(step);
      if (exec == execs_.end()) {
        return 0;
      }
      return exec->second.accelerator_exec_micros();
    }

    int64_t total_micros = 0;
    for (const auto& exec : execs_) {
      total_micros += exec.second.accelerator_exec_micros();
    }
    return total_micros / execs_.size();
  }

  // CPU computation time of a step, or the average across
  // multiple steps when step < 0.
  int64_t cpu_exec_micros(int64_t step) const {
    // Empty when no RunMetadata is provided.
    if (execs_.empty()) {
      return 0;
    }
    if (step >= 0) {
      auto exec = execs_.find(step);
      if (exec == execs_.end()) {
        return 0;
      }
      return exec->second.cpu_exec_micros();
    }

    int64_t total_micros = 0;
    for (const auto& exec : execs_) {
      total_micros += exec.second.cpu_exec_micros();
    }
    return total_micros / execs_.size();
  }
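
  // Usage sketch: the per-step accessors above share one convention.
  // Given a TFGraphNode* node:
  //   node->exec_micros(3);   // micros recorded for step 3 (0 if not run).
  //   node->exec_micros(-1);  // average micros across all recorded steps.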

  int64_t requested_bytes(int64_t step) const { GRAPH_NODE_BYTES(requested); }
  int64_t peak_bytes(int64_t step) const { GRAPH_NODE_BYTES(peak); }
  int64_t residual_bytes(int64_t step) const { GRAPH_NODE_BYTES(residual); }
  int64_t output_bytes(int64_t step) const { GRAPH_NODE_BYTES(output); }

  int64_t all_start_micros(int64_t step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return 0;
    }
    return exec->second.all_start_micros();
  }

  int64_t latest_end_micros(int64_t step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return 0;
    }
    return exec->second.latest_end_micros();
  }

  int64_t lastest_schedule_end_micros(int64_t step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return 0;
    }
    return exec->second.lastest_schedule_end_micros();
  }

  const std::map<string, std::vector<std::pair<int64_t, int64_t>>>& op_execs(
      int64_t step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return empty_execs_;
    }
    return exec->second.op_execs();
  }
  const std::map<string, std::vector<std::pair<int64_t, int64_t>>>& cpu_execs(
      int64_t step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return empty_execs_;
    }
    return exec->second.cpu_execs();
  }

  const std::map<int64_t, ExecStep>& all_op_execs() const { return execs_; }

  int64_t accelerator_temp_bytes(int64_t step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return 0;
    }
    return exec->second.accelerator_temp_bytes();
  }
  int64_t host_temp_bytes(int64_t step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return 0;
    }
    return exec->second.host_temp_bytes();
  }
  int64_t accelerator_persistent_bytes() const {
    int64_t persistent_bytes = 0;
    for (const auto& exec : execs_) {
      persistent_bytes = std::max(persistent_bytes,
                                  exec.second.accelerator_persistent_bytes());
    }
    return persistent_bytes;
  }
  const std::map<int64_t, int64_t> allocator_bytes_in_use(int64_t step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return empty_bytes_in_use_;
    }
    return exec->second.allocator_bytes_in_use();
  }

  const std::vector<AllocationRecord>& allocations(int64_t step) const {
    auto exec = execs_.find(step);
    if (exec == execs_.end()) {
      return empty_allocations_;
    }
    return exec->second.allocations();
  }

  int64_t parameters() const {
    if (!shape().empty()) {
      int64_t params = 1;
      bool complete_shape = true;
      for (int64_t d : shape()) {
        // A dimension can be < 0 when it is unknown.
        if (d < 0) {
          complete_shape = false;
          break;
        }
        params *= d;
      }
      if (complete_shape) {
        return params;
      } else {
        absl::FPrintF(stderr, "Incomplete shape.\n");
      }
    }
    return 0;
  }
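
  // Worked example: a Variable with shape [1024, 512] reports
  // parameters() == 1024 * 512 = 524288. Any dimension < 0 (unknown) makes
  // the shape incomplete and the result 0.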

  int64_t float_ops(int64_t step) const {
    // If the node never ran, return the static analysis result.
    if (execs_.empty()) {
      return node_.float_ops();
    }
    // Otherwise, return the dynamic float_ops: the static count times the
    // number of executions.
    return node_.float_ops() * run_count(step);
  }
  const CallStack* call_stack() { return call_stack_.get(); }
  string canonical_device() const { return node_.canonical_device(); }
  string host_device() const { return node_.host_device(); }
  const std::set<string>& op_types() const { return op_types_; }

  const AttrValue* op_attrs(const string& name) const {
    const auto it = node_.attrs().find(name);
    if (it == node_.attrs().end()) {
      return nullptr;
    }
    return &it->second;
  }

  const std::vector<int64_t>& shape() const { return shape_; }

  const std::map<int, std::vector<int64_t>>& output_shapes() const {
    return output_shapes_;
  }

  const std::map<int, std::vector<int64_t>> input_shapes() const {
    std::map<int, std::vector<int64_t>> input_shapes;
    for (const auto& inp : inputs_) {
      // Always create an empty vec, even if the shape info might be missing.
      std::vector<int64_t>& shape_vec = input_shapes[inp.first];
      if (!nodes_map_) continue;
      auto input_it = nodes_map_->find(inp.second);
      if (input_it == nodes_map_->end()) continue;
      auto output_it = src_output_idx_.find(inp.second);
      if (output_it == src_output_idx_.end()) continue;

      const TFGraphNode* input_node = input_it->second.get();
      if (!input_node) continue;
      const auto& output_shapes = input_node->output_shapes();
      const auto output_shape = output_shapes.find(output_it->second);
      if (output_shape == output_shapes.end()) continue;

      shape_vec.assign(output_shape->second.begin(),
                       output_shape->second.end());
    }
    return input_shapes;
  }

 private:
  // Maps graph node name to TFGraphNode. Not owned.
  const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map_;
  // Inputs to the node: input index -> input node name.
  std::map<int, string> inputs_;
  // The output index of the source node.
  std::map<string, int32> src_output_idx_;
  // Proto for the serialized/deserialized representation of the node.
  ProfileNode node_;
  // Python call stack that created the node.
  std::unique_ptr<CallStack> call_stack_;
  // Shape of the node (e.g. Variable) if available.
  std::vector<int64_t> shape_;
  // No input_idx is ever missing, but some shapes might be empty (unknown).
  std::map<int, std::vector<int64_t>> input_shapes_;
  // An output_idx might be missing if there is no _output_shapes attr.
  // Some shapes can also be empty.
  std::map<int, std::vector<int64_t>> output_shapes_;

  std::set<string> op_types_;

  std::map<int64_t, ExecStep> execs_;

  // Placeholders for empty cases.
  std::map<int64_t, int64_t> empty_bytes_in_use_;
  std::map<string, std::vector<std::pair<int64_t, int64_t>>> empty_execs_;
  std::vector<AllocationRecord> empty_allocations_;
};

class TFMultiGraphNode {
 public:
  TFMultiGraphNode(const string& name)
      : name_(name),
        step_(-1),
        run_count_(0),
        exec_micros_(0),
        accelerator_exec_micros_(0),
        cpu_exec_micros_(0),
        requested_bytes_(0),
        peak_bytes_(0),
        residual_bytes_(0),
        output_bytes_(0),
        float_ops_(0),
        parameters_(0) {}

  bool SnapshotNodes(int64_t step, const std::vector<string>& type_regexes) {
    run_count_ = 0;
    exec_micros_ = 0;
    accelerator_exec_micros_ = 0;
    cpu_exec_micros_ = 0;

    requested_bytes_ = 0;
    peak_bytes_ = 0;
    residual_bytes_ = 0;
    output_bytes_ = 0;

    float_ops_ = 0;
    parameters_ = 0;
    op_types_.clear();
    shapes_.clear();
    devices_.clear();
    snapshot_nodes_.clear();

    step_ = step;
    std::vector<const TFGraphNode*> nodes = pick_nodes(type_regexes);

    if (nodes.empty()) {
      return (type_regexes.size() == 1 && type_regexes[0] == ".*");
    }

    for (const TFGraphNode* node : nodes) {
      op_types_.insert(node->op_types().begin(), node->op_types().end());

      run_count_ += node->run_count(step);
      exec_micros_ += node->exec_micros(step);
      accelerator_exec_micros_ += node->accelerator_exec_micros(step);
      cpu_exec_micros_ += node->cpu_exec_micros(step);

      requested_bytes_ += node->requested_bytes(step);
      peak_bytes_ += node->peak_bytes(step);
      residual_bytes_ += node->residual_bytes(step);
      output_bytes_ += node->output_bytes(step);

      float_ops_ += node->float_ops(step);
      parameters_ += node->parameters();
      if (!node->shape().empty()) {
        shapes_.push_back(node->shape());
      }
      devices_.insert(node->canonical_device());
      snapshot_nodes_[node->name()] = node;
    }
    return true;
  }
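
  // Usage sketch (hypothetical values): snapshot step 0 for all nodes whose
  // op type matches "Conv2D.*", then read the aggregated stats.
  //
  //   TFMultiGraphNode multi("conv_ops");
  //   multi.AddGraphNode(node);  // repeat for each TFGraphNode of interest
  //   if (multi.SnapshotNodes(0, {"Conv2D.*"})) {
  //     int64_t micros = multi.exec_micros();
  //   }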

  int64_t step() const { return step_; }

  void AddGraphNode(const TFGraphNode* node) {
    if (nodes_.find(node->name()) != nodes_.end()) {
      return;
    }
    nodes_[node->name()] = node;
  }

  const std::map<string, const TFGraphNode*>& graph_nodes() const {
    return snapshot_nodes_;
  }

  const string& name() const { return name_; }

  int64_t run_count() const { return run_count_; }
  int64_t exec_micros() const { return exec_micros_; }
  int64_t accelerator_exec_micros() const { return accelerator_exec_micros_; }
  int64_t cpu_exec_micros() const { return cpu_exec_micros_; }

  int64_t requested_bytes() const { return requested_bytes_; }
  int64_t peak_bytes() const { return peak_bytes_; }
  int64_t residual_bytes() const { return residual_bytes_; }
  int64_t output_bytes() const { return output_bytes_; }

  int64_t float_ops() const { return float_ops_; }

  int64_t parameters() const { return parameters_; }

  const std::set<string>& devices() const { return devices_; }

  const std::set<string>& op_types() const { return op_types_; }

  const std::vector<std::vector<int64_t>>& shapes() const { return shapes_; }

 private:
  std::vector<const TFGraphNode*> pick_nodes(
      const std::vector<string>& type_regexes) {
    if (type_regexes.empty()) {
      return {};
    }
    std::vector<const TFGraphNode*> ret;
    if (type_regexes.size() == 1 && type_regexes[0] == ".*") {
      for (const auto& n : nodes_) {
        ret.push_back(n.second);
      }
      return ret;
    }

    for (const string& regex : type_regexes) {
      for (const auto& n : nodes_) {
        for (const string& type : n.second->op_types()) {
          if (RE2::FullMatch(type, regex)) {
            ret.push_back(n.second);
            break;
          }
        }
      }
    }
    return ret;
  }

  const string name_;
  int64_t step_;
  // Snapshot of the nodes, based on type_regexes.
  std::set<string> op_types_;
  int64_t run_count_;
  int64_t exec_micros_;
  int64_t accelerator_exec_micros_;
  int64_t cpu_exec_micros_;

  int64_t requested_bytes_;
  int64_t peak_bytes_;
  int64_t residual_bytes_;
  int64_t output_bytes_;
  int64_t float_ops_;
  int64_t parameters_;
  std::set<string> devices_;
  std::vector<std::vector<int64_t>> shapes_;
  std::map<string, const TFGraphNode*> snapshot_nodes_;

  // Overall data held by the TFMultiGraphNode.
  std::map<string, const TFGraphNode*> nodes_;
};

bool IsPlacedOnCPU(const string& device);
bool IsPlacedOnAccelerator(const string& device);
bool CountAsAcceleratorTime(const string& device);
bool CountAsCPUTime(const string& device);
bool IsCanonicalDevice(const string& device);

}  // namespace tfprof
}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_