/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/grappler/costs/utils.h"

#include <stddef.h>

#include <algorithm>
#include <functional>
#include <sstream>
#include <unordered_map>
#include <utility>
#include <vector>

#include "third_party/eigen3/Eigen/Core"

#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_def.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/tensor_description.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/tensor_id.h"
#include "tensorflow/core/grappler/clusters/utils.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/byte_order.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/protobuf/config.pb.h"
#include "tensorflow/core/util/device_name_utils.h"

namespace tensorflow {
namespace grappler {

static OpInfo::TensorProperties UnknownInput() {
  OpInfo::TensorProperties input;
  input.set_dtype(DataType::DT_INVALID);
  input.mutable_shape()->set_unknown_rank(true);
  return input;
}

static std::vector<TensorProto> ExtractTensors(const AttrValue& attr_value) {
  std::vector<TensorProto> tensors;
  switch (attr_value.value_case()) {
    case AttrValue::kTensor: {
      tensors.push_back(attr_value.tensor());
      break;
    }
    case AttrValue::kList: {
      for (const auto& tensor_proto : attr_value.list().tensor()) {
        tensors.push_back(tensor_proto);
      }
      break;
    }
    default: {
    }
  }
  return tensors;
}

// Annotate the op_info inputs with extra information when possible (e.g. the
// input value if it's known statically).
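// In addition to the statically known value, a filename input gets an
// "input_<i>_filesize" attribute and a handle input a "parent_<i>_op"
// attribute (both derived below).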
static void ExtractExtraProperties(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    OpInfo* op_info) {
  OpRegistry* op_registry = OpRegistry::Global();
  const OpDef* op_def = nullptr;
  auto s = op_registry->LookUpOpDef(node.op(), &op_def);
  if (!s.ok()) {
    op_def = nullptr;
  }

  for (int i = 0; i < node.input_size(); ++i) {
    const string input_name = node.input(i);
    CHECK(!input_name.empty());
    if (IsControlInput(input_name)) {
      continue;
    }
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name(input_tensor_id.first);

    auto iter = name_to_node.find(input_node_name);
    if (iter == name_to_node.end()) continue;
    const NodeDef* input_node = iter->second;
    if (i >= op_info->inputs_size()) {
      LOG(ERROR) << "OpInfo's inputs don't match the graph! OpInfo: "
                 << op_info->DebugString()
                 << "\nCurrent node: " << node.DebugString()
                 << "\nInput node: " << input_node->DebugString();
    }

    // The value attribute of a Const input is useful for cost prediction.
    if (input_node->op() == "Const" && i < op_info->inputs_size()) {
      auto it = input_node->attr().find("value");
      if (it == input_node->attr().end()) continue;

      const AttrValue& attr_value = it->second;
      std::vector<TensorProto> tensors = ExtractTensors(attr_value);
      if (tensors.empty()) continue;

      const TensorProto& t = tensors[0];
      OpInfo::TensorProperties* input = op_info->mutable_inputs(i);
      *(input->mutable_value()) = t;

      // For a filename input, the file size can also be useful.
      if (op_def && i < op_def->input_arg_size() &&
          op_def->input_arg(i).name().find("filename") != string::npos) {
        Tensor tensor;
        if (!tensor.FromProto(t)) {
          continue;
        }
        if (tensor.NumElements() != 1) {
          continue;
        }
        const string filename = tensor.scalar<string>()();

        Env* env = Env::Default();
        FileStatistics stat;
        Status s = env->Stat(filename, &stat);
        if (!s.ok()) {
          continue;
        }
        AttrValue attr;
        attr.set_i(stat.length);
        string attr_key = strings::StrCat("input_", i, "_filesize");
        (*op_info->mutable_attr())[attr_key] = attr;
      }
    }

    // When the input is a handle (e.g. a lookup table handle), the
    // information in the op itself is not sufficient to predict the op's
    // memory usage.
    if (op_def && i < op_def->input_arg_size() &&
        op_def->input_arg(i).name().find("handle") != string::npos) {
      string new_key = strings::StrCat("parent_", i, "_op");
      AttrValue attr;
      attr.set_s(input_node->op());
      (*op_info->mutable_attr())[new_key] = attr;
      // TODO(yuefengz): Only the parent node's op name is copied. Copy its
      // inputs and attributes when necessary.
    }
  }
}

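// Looks up the TensorProperties of every non-control input of `node` in the
// cost graph; inputs whose shapes are not recorded there fall back to
// UnknownInput() (DT_INVALID with unknown rank).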
std::vector<OpInfo::TensorProperties> FindInputFeatures(
    const NodeDef& node,
    const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost,
    const std::unordered_map<string, const NodeDef*>& name_to_node) {
  std::vector<OpInfo::TensorProperties> inputs;
  for (const auto& input_name : node.input()) {
    CHECK(!input_name.empty());
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name(input_tensor_id.first);
    const int output_index = input_tensor_id.second;

    // Skip control inputs.
    if (output_index == Graph::kControlSlot) {
      continue;
    }

    auto it = name_to_cost.find(input_node_name);
    if (it == name_to_cost.end() || output_index < 0) {
      inputs.push_back(UnknownInput());
    } else {
      const CostGraphDef::Node* input_cost = it->second;
      if (input_cost->output_info_size() == 0) {
        inputs.push_back(UnknownInput());
      } else {
        const CostGraphDef::Node::OutputInfo& output =
            input_cost->output_info(output_index);
        OpInfo::TensorProperties input;
        input.set_dtype(output.dtype());
        *input.mutable_shape() = output.shape();
        inputs.push_back(input);
      }
    }
  }

  return inputs;
}

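// Estimates the tensor size in bytes from its dtype and (possibly partial)
// shape. A minimal worked example: a DT_FLOAT tensor of shape [32, -1] is
// treated as [32, 1], so the estimate is 32 * 1 * sizeof(float) = 128 bytes.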
int64 CalculateTensorSize(const OpInfo::TensorProperties& prop) {
  int64 size = DataTypeSize(BaseType(prop.dtype()));
  TensorShapeProto shape = prop.shape();

  // Can't infer the size if the rank is unknown. It has to be at least a
  // scalar though.
  if (shape.unknown_rank()) {
    VLOG(2) << "CalculateTensorSize() -- unknown rank";
    return size;
  }

  // If one of the dimensions is unknown statically, assume it's at least one.
  for (int i = 0; i < shape.dim_size(); ++i) {
    if (shape.dim(i).size() < 0) {
      shape.mutable_dim(i)->set_size(1);
      VLOG(2) << "CalculateTensorSize() -- unknown dim: " << i;
    }
  }

  int64 num_elems = TensorShape(shape).num_elements();
  return num_elems * size;
}

int64 CalculateOutputSize(
    const std::vector<OpInfo::TensorProperties>& output_properties,
    const int port_num) {
  if (port_num < 0) return 4;  // 4B for control dependency.

  if (port_num >= output_properties.size()) {
    LOG(ERROR) << "CalculateOutputSize() -- port_num: " << port_num
               << " >= output_properties.size(): " << output_properties.size();
    return 0;
  }

  return CalculateTensorSize(output_properties[port_num]);
}

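// Resolves a fully-qualified device name to the properties of the local
// device it refers to; names that can't be parsed yield a DeviceProperties
// of type "UNKNOWN".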
DeviceProperties GetDeviceInfo(const string& device_str) {
  DeviceProperties unknown;
  unknown.set_type("UNKNOWN");

  DeviceNameUtils::ParsedName parsed;
  if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
    if (parsed.type == "GPU") {
      TfGpuId tf_gpu_id(parsed.id);
      PlatformGpuId platform_gpu_id;
      Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
      if (!s.ok()) {
        // We are probably running a simulation without linking the CUDA
        // libraries.
        platform_gpu_id = PlatformGpuId(parsed.id);
      }
      return GetLocalGPUInfo(platform_gpu_id);
    } else if (parsed.type == "CPU") {
      return GetLocalCPUInfo();
    }
  }
  return unknown;
}

DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
  return GetDeviceInfo(node.device());
}

OpInfo BuildOpInfoWithoutDevice(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    const std::vector<OpInfo::TensorProperties>& inputs) {
  OpInfo op_info;
  op_info.set_op(node.op());
  *op_info.mutable_attr() = node.attr();
  for (auto& input : inputs) {
    *op_info.add_inputs() = input;
  }
  ExtractExtraProperties(node, name_to_node, &op_info);
  return op_info;
}

string GetOpDescription(const OpInfo& op_info) {
  string description = "[";
  description += "Op=" + op_info.op() + ", ";
  description += "input_shapes=[";
  for (auto const& input : op_info.inputs()) {
    description += PartialTensorShape::DebugString(input.shape());
  }
  // Close both the input_shapes list and the outer bracket.
  description += "]]";
  return description;
}

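// Combines a GraphDef with the CostGraphDef produced by running it, building
// one OpPerformance record per executed node: op info, device, and
// compute/memory costs.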
OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph,
                                               const GraphDef& graph) {
  OpPerformanceList ret;
  std::unordered_map<string, const CostGraphDef::Node*> name_to_cost;
  std::unordered_map<string, const NodeDef*> name_to_node;
  for (auto& node : cost_graph.node()) {
    name_to_cost[node.name()] = &node;
  }
  for (auto& node : graph.node()) {
    name_to_node[node.name()] = &node;
  }

  for (const auto& node : graph.node()) {
    // Skip nodes that are not in the cost graph: these are nodes that aren't
    // run because they aren't in the intersection of the transitive fan-in of
    // a fetch node and the transitive fan-out of an input, or nodes that were
    // optimized away by the optimizer. Since they don't contribute to the
    // execution time we simply discard them.
    auto it = name_to_cost.find(node.name());
    if (it == name_to_cost.end()) {
      continue;
    }
    const CostGraphDef::Node* cost_node = it->second;

    OpPerformance* perf = ret.add_op_performance();
    perf->set_node(node.name());

    std::vector<OpInfo::TensorProperties> inputs =
        FindInputFeatures(node, name_to_cost, name_to_node);
    *perf->mutable_op() = BuildOpInfoWithoutDevice(node, name_to_node, inputs);
    *perf->mutable_op()->mutable_device() = GetDeviceInfo(cost_node->device());

    perf->set_temporary_memory_size(cost_node->temporary_memory_size());
    // Note that CostGraphDef::Node::compute_cost is in microseconds, while
    // OpPerformance.compute_cost is in nanoseconds.
    perf->set_compute_cost(cost_node->compute_cost() * 1000);
    perf->set_compute_time(cost_node->compute_time() * 1000);
    perf->set_memory_time(cost_node->memory_time() * 1000);

    for (const auto& output_info : cost_node->output_info()) {
      perf->mutable_op_memory()->add_output_memory(output_info.size());
    }

    perf->mutable_op_memory()->set_temp_memory(
        cost_node->temporary_memory_size());
    perf->mutable_op_memory()->set_persistent_memory(
        cost_node->persistent_memory_size());
  }
  return ret;
}

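// A minimal usage sketch for the histogram below (hypothetical sizes):
//   TensorSizeHistogram hist;
//   hist.Add(1024);   // one 1KiB tensor
//   hist.Add(4096);   // one 4KiB tensor
//   LOG(INFO) << hist.ToString();
// Sizes are binned into power-of-two buckets via Index().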
void TensorSizeHistogram::Add(const uint64 value) {
  num_elem_++;
  sum_elem_ += value;
  min_ = std::min(min_, value);
  max_ = std::max(max_, value);
  buckets_[Index(value)]++;
}

void TensorSizeHistogram::Merge(const TensorSizeHistogram& src) {
  num_elem_ += src.num_elem_;
  sum_elem_ += src.sum_elem_;
  min_ = std::min(min_, src.min_);
  max_ = std::max(max_, src.max_);
  std::transform(buckets_.begin(), buckets_.end(), src.buckets_.begin(),
                 buckets_.begin(), std::plus<uint64>());
}

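// Renders the histogram in a fixed-width layout: a summary line followed by
// one row per non-empty power-of-two bucket with its [left, right) bounds,
// count, percentage, cumulative percentage, and a bar of '#' marks
// (40 marks == 100%).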
string TensorSizeHistogram::ToString() const {
  string r;
  char buf[200];
  snprintf(buf, sizeof(buf), "Count: %llu, Average: ",
           static_cast<unsigned long long>(num_elem_));
  r.append(buf);
  r.append(strings::HumanReadableNumBytes(Average()));
  r.append(", Min: ");
  r.append(strings::HumanReadableNumBytes(min_));
  r.append(", Max: ");
  r.append(strings::HumanReadableNumBytes(max_));
  r.append("\n------------------------------------------------------\n");
  const double mult = num_elem_ > 0 ? 100.0 / num_elem_ : 0.0;
  uint64 cumul_sum = 0;

  const int size_string_width = 12;
  for (int i = 0; i < buckets_.size(); i++) {
    if (buckets_[i] == 0) continue;
    cumul_sum += buckets_[i];
    r.append("[ ");
    if (i == 0) {
      r.append(size_string_width - 2, ' ');
      r.append("0B");
    } else {
      uint64 left = 1ULL << (i - 1);
      const auto left_string = strings::HumanReadableNumBytes(left);
      r.append(size_string_width - left_string.size(), ' ');
      r.append(left_string);
    }
    r.append(", ");
    uint64 right = 1ULL << i;
    const auto right_string = strings::HumanReadableNumBytes(right);
    r.append(size_string_width - right_string.size(), ' ');
    r.append(right_string);
    snprintf(buf, sizeof(buf), ") %7llu %7.3f%% %7.3f%% ",
             static_cast<unsigned long long>(buckets_[i]),  // count
             mult * buckets_[i],                            // percentage
             mult * cumul_sum);                             // cum percentage
    r.append(buf);

    // Add hash marks based on percentage; 40 marks for 100%, rounded to the
    // nearest mark.
    auto marks = static_cast<int>(
        (static_cast<double>(40 * buckets_[i] + (num_elem_ >> 1)) / num_elem_));
    r.append(marks, '#');
    r.push_back('\n');
  }
  return r;
}

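// Maps a byte size to its histogram bucket: 0 -> bucket 0, 1 -> bucket 1,
// 2-3 -> bucket 2, 4-7 -> bucket 3, and so on, capped at kMaxBuckets - 1.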
const int TensorSizeHistogram::Index(const uint64 value) const {
  // Log2Floor64 returns -1 for 0, 0 for 1, 1 for 2-3, 2 for 4-7, ...
  const auto index = Log2Floor64(value) + 1;
  return std::min(index, kMaxBuckets - 1);
}

string GetDeviceClassForNonChannelDevice(const string& device_name) {
  DeviceNameUtils::ParsedName parsed_name;
  bool parsed = DeviceNameUtils::ParseFullName(device_name, &parsed_name);
  if (!parsed) {
    string name = str_util::StringReplace(device_name, "/job_", "/job:", true);
    name = str_util::StringReplace(name, "/replica_", "/replica:", true);
    name = str_util::StringReplace(name, "/task_", "/task:", true);
    name = str_util::StringReplace(name, "/device_", "/device:", true);
    name = str_util::StringReplace(name, "GPU_", "GPU:", true);
    name = str_util::StringReplace(name, "CPU_", "CPU:", true);
    name = str_util::StringReplace(name, "gpu_", "gpu:", true);
    name = str_util::StringReplace(name, "cpu_", "cpu:", true);
    parsed = DeviceNameUtils::ParseFullName(name, &parsed_name);
  }
  if (parsed) {
    const string jobname = parsed_name.has_job ? parsed_name.job : "";
    return strings::StrCat("/", jobname, "/", parsed_name.type);
  } else {
    return "Unclassified";
  }
}

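// Maps a device (or channel) name to a coarse "/<job>/<type>" class. For
// example, "/job:w/replica:0/task:0/device:GPU:0" becomes "/w/GPU", and a
// channel name of the form "Channel_from_<src>_to_<dst>" becomes
// "Channel: <src class> -> <dst class>".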
string GetDeviceClass(const string& device_name) {
  // TODO(dyoon): channel device name follows the convention we currently have
  // in VirtualScheduler. This should be revised with VirtualScheduler as well
  // as VirtualPlacer in the future.
  if (device_name.find("Channel") != string::npos) {
    const string from = "_from_";
    const string to = "_to_";
    const auto from_loc = device_name.find(from);
    const auto to_loc = device_name.find(to);
    const auto src_device_full = device_name.substr(
        from_loc + from.size(), to_loc - (from_loc + from.size()));
    const auto dst_device_full = device_name.substr(to_loc + to.size());
    return strings::StrCat(
        "Channel", ": ", GetDeviceClassForNonChannelDevice(src_device_full),
        " -> ", GetDeviceClassForNonChannelDevice(dst_device_full));
  } else {
    return GetDeviceClassForNonChannelDevice(device_name);
  }
}

string GetStatsStringFromRunMetadata(const RunMetadata& run_metadata,
                                     bool verbosity) {
  // TODO(dyoon): print out other stats as needed.
  std::ostringstream output;

  // Tensor size histograms: if `verbosity` is set, per-device histograms are
  // printed in addition to the aggregated per-class histogram; otherwise only
  // the per-class histogram is printed.
  std::unordered_map<string, TensorSizeHistogram> device_to_hist_map;
  const auto& step_stats = run_metadata.step_stats();
  for (const auto& dev_stat : step_stats.dev_stats()) {
    const auto& device_name = dev_stat.device();
    auto& hist = device_to_hist_map[device_name];
    for (const auto& node_stat : dev_stat.node_stats()) {
      for (const auto& node_output : node_stat.output()) {
        // TODO(dyoon): Calculate tensor size from tensor_description's dtype
        // and shape, instead of using optional allocation_description.
        const auto size = node_output.tensor_description()
                              .allocation_description()
                              .allocated_bytes();
        hist.Add(size);
      }
    }
  }
  if (verbosity) {
    output << "\n";
    output << "Per device tensor size histogram.\n";
  }

  std::unordered_map<string, TensorSizeHistogram> device_class_to_hist_map;
  for (const auto& device_hist : device_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    if (verbosity) {
      output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
    }
    const auto device_class = GetDeviceClass(device_name);
    auto it = device_class_to_hist_map.find(device_class);
    if (it == device_class_to_hist_map.end()) {
      device_class_to_hist_map.emplace(device_class, TensorSizeHistogram(hist));
    } else {
      it->second.Merge(hist);
    }
  }
  output << "\n";
  output << "Aggregated per device / channel type tensor size histogram:\n";
  for (const auto& device_hist : device_class_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
  }
  output << "\n";

  return output.str();
}

}  // end namespace grappler
}  // end namespace tensorflow