/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/graph/costmodel.h"

#include <algorithm>
#include <vector>

#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/cost_graph.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor_description.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {
namespace {
const Microseconds kDefaultTimeEstimate(1);
const Microseconds kMinTimeEstimate(1);
}  // namespace

void CostModel::SuppressInfrequent() {
  // Find the median of the non-zero counts, and use half of its value
  // as the cutoff for a "normal" execution mode node.
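  // For example, with non-zero counts {2, 6, 10} the median is 6 and the
  // cutoff becomes min_count_ = 3.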
  if (count_.empty()) return;
  std::vector<int32> non_zero;
  for (auto v : count_) {
    if (v > 0) non_zero.push_back(v);
  }
  const size_t sz = non_zero.size();
  if (sz > 0) {
    std::nth_element(non_zero.begin(), non_zero.begin() + sz / 2,
                     non_zero.end());
    int32 median_value = non_zero[sz / 2];
    min_count_ = median_value / 2;
    VLOG(1) << "num non_zero vals: " << non_zero.size() << " median_value "
            << median_value;
  } else {
    min_count_ = 1;
  }
}

void CostModel::MergeFromLocal(const Graph& g, const CostModel& cm) {
  CHECK(is_global_);
  CHECK(!cm.is_global());
  for (const Node* n : g.nodes()) {
    const int local_id = cm.Id(n);
    const int global_id = Id(n);
    if (local_id < 0 || global_id < 0) continue;
    int num_slots = cm.slot_bytes_[local_id].size();
    Ensure(global_id, num_slots);
    count_[global_id] += cm.count_[local_id];
    time_[global_id] += cm.time_[local_id];
    if (num_slots > 0) {
      if (slot_bytes_[global_id].empty()) {
        slot_bytes_[global_id].resize(num_slots);
      } else {
        CHECK_EQ(num_slots, slot_bytes_[global_id].size());
      }
      for (int s = 0; s < num_slots; ++s) {
        slot_bytes_[global_id][s] += cm.slot_bytes_[local_id][s];
      }
    }
  }
}

void CostModel::MergeFromGlobal(const CostModel& cm) {
  CHECK(is_global_);
  CHECK(cm.is_global());
  const int num_nodes = cm.count_.size();
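  // Iterate from the highest id down so that the first Ensure() call grows
  // our vectors to their final size in a single resize.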
  for (int i = num_nodes - 1; i >= 0; --i) {
    int num_slots = cm.slot_bytes_[i].size();
    // Grow our own vectors before reading or writing entry i.
    Ensure(i, num_slots);
    count_[i] += cm.count_[i];
    time_[i] += cm.time_[i];
    if (num_slots > 0) {
      if (slot_bytes_[i].empty()) {
        slot_bytes_[i].resize(num_slots);
      } else {
        CHECK_EQ(num_slots, slot_bytes_[i].size());
      }
      for (int s = 0; s < num_slots; ++s) {
        slot_bytes_[i][s] += cm.slot_bytes_[i][s];
      }
    }
  }
}

void CostModel::MergeFromStats(const NodeNameToCostIdMap& map,
                               const StepStats& ss) {
  CHECK(is_global_);
  for (auto& ds : ss.dev_stats()) {
    for (auto& ns : ds.node_stats()) {
      NodeNameToCostIdMap::const_iterator iter = map.find(ns.node_name());
      // We don't keep stats for nodes that are not in the global graph, e.g.
      // copy/send/recv nodes and feed/fetch nodes.
      if (iter == map.end()) continue;
      int32 global_id = iter->second;
      Ensure(global_id, ns.output_size());
      int64 elapsed_micros = ns.op_end_rel_micros() - ns.op_start_rel_micros();
      count_[global_id]++;
      time_[global_id] += elapsed_micros;
      for (auto& no : ns.output()) {
        int si = no.slot();
        if (static_cast<size_t>(si) >= slot_bytes_[global_id].size()) {
          slot_bytes_[global_id].resize(1 + si);
        }
        slot_bytes_[global_id][si] +=
            no.tensor_description().allocation_description().requested_bytes();
      }
    }
  }
}

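// Grows the per-node vectors so that `id` is a valid index, and grows the
// node's per-output vectors to at least `num_outputs` slots, initializing
// any new entries to "unknown" defaults.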
void CostModel::Ensure(int id, int num_outputs) {
  if (slot_bytes_.size() <= static_cast<size_t>(id)) {
    slot_bytes_.resize(id + 1);
    count_.resize(id + 1);
    time_.resize(id + 1);
    max_mem_usage_.resize(id + 1);
    max_exec_time_.resize(id + 1);
    output_port_alloc_ids_.resize(id + 1);
  }
  if (num_outputs > 0) {
    auto perslot = &slot_bytes_[id];
    auto output_port_alloc_ids = &output_port_alloc_ids_[id];
    auto max_mem_usage = &max_mem_usage_[id];

    CHECK_LE(perslot->size(), num_outputs);
    DCHECK_EQ(output_port_alloc_ids->size(), perslot->size());
    DCHECK_EQ(max_mem_usage->output_port_mem.size(), perslot->size());
    DCHECK_EQ(max_mem_usage->output_port_shape.size(), perslot->size());
    DCHECK_EQ(max_mem_usage->output_port_type.size(), perslot->size());

    perslot->resize(num_outputs, Bytes(-1));
    output_port_alloc_ids->resize(num_outputs, -1);
    max_mem_usage->output_port_mem.resize(num_outputs, Bytes(-1));
    max_mem_usage->output_port_shape.resize(num_outputs, unknown_shape_);
    max_mem_usage->output_port_type.resize(num_outputs, DT_INVALID);
  }
}

void CostModel::SetNumOutputs(const Node* node, int num_outputs) {
  const int id = Id(node);
  if (id < 0) return;
  // Make sure the vectors have an entry for this node, but do not grow the
  // slot count yet: the node's existing number of slots must be checked first.
  Ensure(id, 0);
  auto perslot = &slot_bytes_[id];
  if (!perslot->empty()) {
    CHECK_EQ(num_outputs, perslot->size())
        << "Cannot resize slot_bytes, node=" << node->name();
  }
  Ensure(id, num_outputs);
}

void CostModel::RecordCount(const Node* node, int count) {
  const int id = Id(node);
  if (id < 0) return;
  CHECK_LT(id, slot_bytes_.size());
  count_[id] += count;
}

int32 CostModel::TotalCount(const Node* node) const {
  const int id = Id(node);
  if (id < 0) return 0;
  return (static_cast<size_t>(id) < slot_bytes_.size()) ? count_[id] : 0;
}

void CostModel::RecordSize(const Node* node, int slot, Bytes bytes) {
  const int id = Id(node);
  if (id < 0) return;
  CHECK_LT(id, slot_bytes_.size());
  auto perslot = &slot_bytes_[id];
  CHECK_LT(slot, perslot->size());
  auto v = &(*perslot)[slot];
  if (*v >= 0) {
    *v += bytes;
  } else {
    *v = bytes;
  }
}

Bytes CostModel::TotalBytes(const Node* node, int slot) const {
  const int id = Id(node);
  if (id < 0 || static_cast<size_t>(id) >= slot_bytes_.size() ||
      slot_bytes_[id].size() <= static_cast<size_t>(slot)) {
    return Bytes(0);
  }
  return slot_bytes_[id][slot];
}

Bytes CostModel::SizeEstimate(const Node* node, int slot) const {
  int32 count = TotalCount(node);
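  // Nodes that ran fewer than min_count_ times are treated as infrequent
  // (see SuppressInfrequent()) and report a size of zero.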
  if (count < min_count_) return Bytes(0);
  return TotalBytes(node, slot) / std::max(1, count);
}

void CostModel::RecordTime(const Node* node, Microseconds time) {
  const int id = Id(node);
  if (id < 0) return;
  DCHECK(node->IsOp()) << node->DebugString();
  Ensure(id, node->num_outputs());
  time_[id] += time;
}

Microseconds CostModel::TotalTime(const Node* node) const {
  DCHECK(node->IsOp()) << node->DebugString();
  const int id = Id(node);
  if (id < 0 || static_cast<size_t>(id) >= time_.size() ||
      time_[id] < Microseconds(0)) {
    return Microseconds(0);
  }
  return time_[id];
}

Microseconds CostModel::TimeEstimate(const Node* node) const {
  int32 count = TotalCount(node);
  if (count <= min_count_) return kMinTimeEstimate;
  return std::max(kMinTimeEstimate, TotalTime(node) / std::max(1, count));
}

void CostModel::CheckInitialized(const Graph& graph) const {
  for (const Node* n : graph.op_nodes()) {
    CHECK(static_cast<size_t>(n->id()) < time_.size() &&
          time_[n->id()] >= Microseconds(0))
        << ": no time estimate for " << n->DebugString();

    CHECK(static_cast<size_t>(n->id()) < slot_bytes_.size())
        << ": no size estimate for " << n->DebugString();
    const auto& perslot = slot_bytes_[n->id()];
    for (size_t i = 0; i < perslot.size(); i++) {
      CHECK_GE(perslot[i], Bytes(0)) << ": no size estimate for output# " << i
                                     << " of " << n->DebugString();
    }
  }
}

void CostModel::RecordMaxMemorySize(const Node* node, int output_slot,
                                    Bytes bytes,
                                    const TensorShapeProto& tensor_shape,
                                    const DataType& dtype) {
  const int id = Id(node);
  if (id < 0) return;
  if (output_slot >= node->num_outputs()) {
    LOG(ERROR) << "Unexpected output slot for node " << node->DebugString()
               << ". Got " << output_slot << " but its num_outputs is "
               << node->num_outputs();
    return;
  }
  Ensure(id, node->num_outputs());
  auto& current_max = max_mem_usage_[id].output_port_mem[output_slot];
  // If the memory allocator doesn't track memory usage, let's infer a lower
  // bound from the tensor shape and its data type.
  if (bytes.value() < 0) {
    bytes = MinTensorMemoryUsage(tensor_shape, dtype);
  }
  if (bytes.value() > current_max.value()) {
    current_max = bytes.value();
    max_mem_usage_[id].output_port_shape[output_slot] = tensor_shape;
    max_mem_usage_[id].output_port_type[output_slot] = dtype;
  }
}

Bytes CostModel::MaxMemorySize(const Node* node, int slot) const {
  const int id = Id(node);
  if (id < 0 || static_cast<size_t>(id) >= max_mem_usage_.size() ||
      max_mem_usage_[id].output_port_mem.size() <= static_cast<size_t>(slot)) {
    return Bytes(0);
  }
  return max_mem_usage_[id].output_port_mem[slot];
}

const TensorShapeProto& CostModel::MaxMemoryShape(const Node* node,
                                                  int slot) const {
  const int id = Id(node);
  if (id < 0 || static_cast<size_t>(id) >= max_mem_usage_.size() ||
      max_mem_usage_[id].output_port_shape.size() <=
          static_cast<size_t>(slot)) {
    return unknown_shape_;
  }
  return max_mem_usage_[id].output_port_shape[slot];
}

DataType CostModel::MaxMemoryType(const Node* node, int slot) const {
  const int id = Id(node);
  if (id < 0 || static_cast<size_t>(id) >= max_mem_usage_.size() ||
      max_mem_usage_[id].output_port_type.size() <= static_cast<size_t>(slot)) {
    return DT_INVALID;
  }
  return max_mem_usage_[id].output_port_type[slot];
}

Bytes CostModel::TempMemorySize(const Node* node) const {
  const int id = Id(node);
  // Guard against ids this model has never seen, mirroring MaxMemorySize().
  if (id < 0 || static_cast<size_t>(id) >= max_mem_usage_.size()) {
    return Bytes(0);
  }
  return max_mem_usage_[id].temp_memory_size;
}

Bytes CostModel::PersistentMemorySize(const Node* node) const {
  const int id = Id(node);
  if (id < 0 || static_cast<size_t>(id) >= max_mem_usage_.size()) {
    return Bytes(0);
  }
  return max_mem_usage_[id].persistent_memory_size;
}

void CostModel::RecordMemoryStats(const Node* node,
                                  const MemoryStats& memory_stats) {
  const int id = Id(node);
  if (id < 0) return;
  // Grow the vectors so that max_mem_usage_[id] below is a valid access.
  Ensure(id, node->num_outputs());
  max_mem_usage_[id].temp_memory_size = memory_stats.temp_memory_size();
  max_mem_usage_[id].persistent_memory_size =
      memory_stats.persistent_memory_size();
  for (int64 alloc_id : memory_stats.persistent_tensor_alloc_ids()) {
    if (alloc_id > 0) {
      persistent_alloc_ids_.insert(alloc_id);
    }
  }
}

void CostModel::RecordMaxExecutionTime(const Node* node, Microseconds time) {
  const int id = Id(node);
  if (id < 0) return;
  Ensure(id, node->num_outputs());
  max_exec_time_[id] = std::max(max_exec_time_[id], time);
}

Microseconds CostModel::MaxExecutionTime(const Node* node) const {
  const int id = Id(node);
  if (id < 0 || static_cast<size_t>(id) >= max_exec_time_.size()) {
    return Microseconds(0);
  }
  return max_exec_time_[id];
}

void CostModel::RecordAllocationId(const Node* node, int output_slot,
                                   int64 alloc_id) {
  const int id = Id(node);
  if (id < 0) return;
  Ensure(id, node->num_outputs());
  output_port_alloc_ids_[id][output_slot] = alloc_id;
}

int64 CostModel::AllocationId(const Node* node, int slot) const {
  const int id = Id(node);
  if (id < 0 || static_cast<size_t>(id) >= output_port_alloc_ids_.size() ||
      output_port_alloc_ids_[id].size() <= static_cast<size_t>(slot)) {
    return -1;
  }
  return output_port_alloc_ids_[id][slot];
}

bool CostModel::IsPersistentTensor(const Node* node, int64 alloc_id) const {
  if (persistent_alloc_ids_.count(alloc_id) > 0) {
    return true;
  }
  if (persistent_alloc_ids_by_devices_.find(node->assigned_device_name()) ==
      persistent_alloc_ids_by_devices_.end()) {
    return false;
  }
  return persistent_alloc_ids_by_devices_.at(node->assigned_device_name())
      .count(alloc_id);
}

Microseconds CostModel::CopyTimeEstimate(Bytes b, double network_latency_millis,
                                         double estimated_gbps) {
  // TODO(jeff,sanjay): estimate cost based on bandwidth along the
  // communication path and the type of transport we are using between
  // devices.
  //
  // We assume the copy time follows a linear model:
  //    copy_time = copy_bytes / rate + min_time
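  //
  // For example, copying 1 MB at an estimated 8 Gbps (1000 bytes/usec) with
  // 1 ms of network latency gives 1000000 / 1000 + 1000 = 2000 usec.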
  int64 copy_bytes = b.value();
  const double bytes_per_usec = estimated_gbps * 1000.0 / 8;
  const double min_micros = network_latency_millis * 1000.0;
  return Microseconds(
      static_cast<int64>(copy_bytes / bytes_per_usec + min_micros));
}

Microseconds CostModel::ComputationTimeEstimate(int64 math_ops) {
  // TODO(jeff,sanjay): Eventually we should pass in the type of device
  // (GPU vs. CPU) and use that to affect the estimate.

  // We divide the estimated number of multiply-adds (madds) by 1000 to
  // convert it into microseconds, assuming roughly 1000 madds per
  // microsecond (~1 GHz for one core).
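  //
  // For example, math_ops = 2000000 madds gives an estimate of 2000 usec.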
  return Microseconds(math_ops / 1000);
}

void CostModel::IncrementUpdateTimes() { update_times_++; }

int32 CostModel::GetUpdateTimes() const { return update_times_; }

// ----------------------------------------------------------------------------
// InitCostModel
// ----------------------------------------------------------------------------

namespace {

static void AddNodesToCostModel(const Graph& g, CostModel* cost_model) {
  for (Node* n : g.nodes()) {
    const int num_outputs = n->num_outputs();
    cost_model->SetNumOutputs(n, num_outputs);
    for (int output = 0; output < num_outputs; output++) {
      // Set up an initial bogus estimate for the node's outputs.
      cost_model->RecordSize(n, output, Bytes(1));
    }
  }
}

static void AssignSizes(const Graph& g, CostModel* cost_model) {
  for (const Edge* e : g.edges()) {
    // Skip if it is a control edge.
    if (e->IsControlEdge()) {
      continue;
    }
    const Node* src = e->src();

    // TODO(josh11b): Get an estimate from the Op
    Bytes size(1);
    cost_model->RecordSize(src, e->src_output(), size);
  }
}

// This generates an extremely simple initial guess for the
// computation cost of each node. For ordinary Ops, its value should quickly
// be wiped out by the real runtime measurements. For other Ops we don't
// actually generate measurements, so suppression of infrequent Ops ends up
// giving them 0 costs. So, this is not of much consequence except perhaps
// in tests.
static Microseconds TimeEstimateForNode(CostModel* cost_model, Node* n) {
  CHECK(n->IsOp());
  VLOG(2) << "Node " << n->id() << ": " << n->name()
          << " type_string: " << n->type_string();
  if (IsConstant(n) || IsVariable(n)) {
    return Microseconds(0);
  }
  return kDefaultTimeEstimate;
}

static void EstimateComputationCosts(const Graph& g, CostModel* cost_model) {
  for (Node* n : g.nodes()) {
    if (!n->IsOp()) continue;
    cost_model->RecordTime(n, TimeEstimateForNode(cost_model, n));
  }
}

}  // namespace

void CostModel::InitFromGraph(const Graph& g) {
  const int num_node_ids = g.num_node_ids();
  slot_bytes_.reserve(num_node_ids);
  count_.reserve(num_node_ids);
  time_.reserve(num_node_ids);
  max_mem_usage_.reserve(num_node_ids);
  max_exec_time_.reserve(num_node_ids);
  output_port_alloc_ids_.reserve(num_node_ids);

  AddNodesToCostModel(g, this);
  AssignSizes(g, this);
  EstimateComputationCosts(g, this);
  CheckInitialized(g);
}

void CostModel::AddToCostGraphDef(const Graph* graph,
                                  CostGraphDef* cost_graph) const {
  std::vector<const Edge*> inputs;
  std::vector<const Edge*> control_inputs;
  for (const Node* n : graph->nodes()) {
    CostGraphDef::Node* cnode = cost_graph->add_node();
    cnode->set_name(n->name());
    cnode->set_device(n->assigned_device_name());
    cnode->set_id(Id(n));

    inputs.clear();
    inputs.resize(n->num_inputs(), nullptr);
    control_inputs.clear();
    for (const Edge* e : n->in_edges()) {
      if (e->IsControlEdge()) {
        control_inputs.push_back(e);
      } else {
        inputs[e->dst_input()] = e;
      }
    }
    std::sort(control_inputs.begin(), control_inputs.end(),
              [this](Edge const* a, Edge const* b) {
                return Id(a->src()) < Id(b->src());
              });

    for (const Edge* e : inputs) {
      CostGraphDef::Node::InputInfo* input_info = cnode->add_input_info();
      input_info->set_preceding_node(Id(e->src()));
      input_info->set_preceding_port(e->src_output());
    }

    for (int i = 0; i < n->num_outputs(); i++) {
      CostGraphDef::Node::OutputInfo* output_info = cnode->add_output_info();
      int64 alloc_id = AllocationId(n, i);
      int64 alias_to_input = -1;
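      // An output aliases an input when the two share the same allocation id.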
      for (const Edge* e : inputs) {
        int64 input_alloc_id = AllocationId(e->src(), e->src_output());
        if (input_alloc_id == alloc_id) {
          alias_to_input = e->dst_input();
          break;
        }
      }
      output_info->set_alias_input_port(alias_to_input);
      output_info->set_dtype(MaxMemoryType(n, i));
      *output_info->mutable_shape() = MaxMemoryShape(n, i);
      if (alias_to_input < 0 && IsPersistentTensor(n, alloc_id)) {
        output_info->set_size(0);
      } else {
        output_info->set_size(MaxMemorySize(n, i).value());
      }
    }

    for (const Edge* e : control_inputs) {
      cnode->add_control_input(Id(e->src()));
    }

    cnode->set_temporary_memory_size(TempMemorySize(n).value());
    cnode->set_persistent_memory_size(PersistentMemorySize(n).value());

    cnode->set_compute_cost(MaxExecutionTime(n).value());

    // For now we treat all send nodes as final.
    // TODO(yuanbyu): Send nodes for fetches shouldn't be treated as final.
    cnode->set_is_final(n->IsSend());
  }
}

void CostModel::WriteSummaryToLog() const {
  LOG(INFO) << " min_count_=" << min_count_;
  for (size_t i = 0; i < count_.size(); ++i) {
    LOG(INFO) << "Node " << i << " count " << count_[i] << " total time "
              << time_[i] << " avg time "
              << (time_[i] / (std::max(1, count_[i])));
  }
}

Bytes CostModel::MinTensorMemoryUsage(const TensorShapeProto& tensor_shape,
                                      const DataType& dtype) {
  if (tensor_shape.unknown_rank()) {
    return Bytes(-1);
  }

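  // Lower-bound the footprint by the product of the dimension sizes and the
  // element size; e.g. a {2, ?, 3} DT_FLOAT tensor is at least
  // 2 * 1 * 3 * 4 = 24 bytes.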
  int64 num_coefficients = 1;
  for (const TensorShapeProto::Dim& dim : tensor_shape.dim()) {
    // An unknown dimension is recorded as -1; treat it as at least 1.
    // (std::max<size_t> would wrap the -1 around to SIZE_MAX here.)
    num_coefficients *= std::max<int64>(dim.size(), 1);
  }
  return Bytes(num_coefficients * DataTypeSize(dtype));
}

}  // namespace tensorflow