/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/grappler/costs/utils.h"

#include <stddef.h>

#include <utility>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_def.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/tensor_description.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/tensor_id.h"
#include "tensorflow/core/grappler/clusters/utils.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/platform/byte_order.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/protobuf/config.pb.h"
#include "tensorflow/core/util/device_name_utils.h"

namespace tensorflow {
namespace grappler {

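// Returns TensorProperties describing an input with unknown dtype and
// unknown rank.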
static OpInfo::TensorProperties UnknownInput() {
  OpInfo::TensorProperties input;
  input.set_dtype(DataType::DT_INVALID);
  input.mutable_shape()->set_unknown_rank(true);
  return input;
}

static std::vector<TensorProto> ExtractTensors(const AttrValue& attr_value) {
  std::vector<TensorProto> tensors;
  switch (attr_value.value_case()) {
    case AttrValue::kTensor: {
      tensors.push_back(attr_value.tensor());
      break;
    }
    case AttrValue::kList: {
      for (const auto& tensor_proto : attr_value.list().tensor()) {
        tensors.push_back(tensor_proto);
      }
      break;
    }
    default: {
    }
  }
  return tensors;
}

// Annotate the op_info inputs with extra information when possible (e.g. the
// input value if it's known statically).
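// For a Const input, the tensor value is copied into the input's properties;
// if an input argument's name contains "filename", the file's size is added
// as an "input_<i>_filesize" attr; and for "handle" inputs, the producing
// op's name is recorded in a "parent_<i>_op" attr.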
static void ExtractExtraProperties(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    OpInfo* op_info) {
  OpRegistry* op_registry = OpRegistry::Global();
  const OpDef* op_def = nullptr;
  auto s = op_registry->LookUpOpDef(node.op(), &op_def);
  if (!s.ok()) {
    op_def = nullptr;
  }

  for (int i = 0; i < node.input_size(); ++i) {
    const string input_name = node.input(i);
    CHECK(!input_name.empty());
    if (IsControlInput(input_name)) {
      continue;
    }
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name(input_tensor_id.first);

    auto iter = name_to_node.find(input_node_name);
    if (iter == name_to_node.end()) continue;
    const NodeDef* input_node = iter->second;

    if (i >= op_info->inputs_size()) {
      LOG(ERROR) << "OpInfo's inputs don't match the graph! OpInfo: "
                 << op_info->DebugString()
                 << "\nCurrent node: " << node.DebugString()
                 << "\nInput node: " << input_node->DebugString();
    }

    // The value attribute in a Const input is useful for cost prediction.
    if (input_node->op() == "Const" && i < op_info->inputs_size()) {
      auto it = input_node->attr().find("value");
      if (it == input_node->attr().end()) continue;

      const AttrValue& attr_value = it->second;
      std::vector<TensorProto> tensors = ExtractTensors(attr_value);
      if (tensors.empty()) continue;

      const TensorProto& t = tensors[0];
      OpInfo::TensorProperties* input = op_info->mutable_inputs(i);
      *(input->mutable_value()) = t;

      // For a filename input, the file size can also be useful.
      if (op_def && i < op_def->input_arg_size() &&
          op_def->input_arg(i).name().find("filename") != string::npos) {
        Tensor tensor;
        if (!tensor.FromProto(t)) {
          continue;
        }
        if (tensor.NumElements() != 1) {
          continue;
        }
        const string& filename = tensor.scalar<tstring>()();

        Env* env = Env::Default();
        FileStatistics stat;
        Status s = env->Stat(filename, &stat);
        if (!s.ok()) {
          continue;
        }
        AttrValue attr;
        attr.set_i(stat.length);
        string attr_key = absl::StrCat("input_", i, "_filesize");
        (*op_info->mutable_attr())[attr_key] = attr;
      }
    }

    // When the input is a handle (e.g. a lookup table handle), the
    // information in the op itself is not sufficient to predict the op's
    // memory usage.
    if (op_def && i < op_def->input_arg_size() &&
        op_def->input_arg(i).name().find("handle") != string::npos) {
      string new_key = absl::StrCat("parent_", i, "_op");
      AttrValue attr;
      attr.set_s(input_node->op());
      (*op_info->mutable_attr())[new_key] = attr;
      // TODO(yuefengz): Only the parent node's op name is copied. Copy its
      // inputs and attributes when necessary.
    }
  }
}

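// Returns the tensor properties for each data input of `node`, looked up in
// the cost graph. Control inputs are skipped; inputs that cannot be resolved
// are reported with unknown dtype and rank.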
std::vector<OpInfo::TensorProperties> FindInputFeatures(
    const NodeDef& node,
    const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost,
    const std::unordered_map<string, const NodeDef*>& name_to_node) {
  std::vector<OpInfo::TensorProperties> inputs;
  for (const auto& input_name : node.input()) {
    CHECK(!input_name.empty());
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name(input_tensor_id.first);
    const int output_index = input_tensor_id.second;

    // Skip control inputs.
    if (output_index == Graph::kControlSlot) {
      continue;
    }

    auto it = name_to_cost.find(input_node_name);
    if (it == name_to_cost.end() || output_index < 0) {
      inputs.push_back(UnknownInput());
    } else {
      const CostGraphDef::Node* input_cost = it->second;
      if (input_cost->output_info_size() == 0) {
        inputs.push_back(UnknownInput());
      } else {
        const CostGraphDef::Node::OutputInfo& output =
            input_cost->output_info(output_index);
        OpInfo::TensorProperties input;
        input.set_dtype(output.dtype());
        *input.mutable_shape() = output.shape();
        inputs.push_back(input);
      }
    }
  }

  return inputs;
}

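// Returns the estimated size of the tensor in bytes. Statically unknown
// dimensions are assumed to be 1, so the result is a lower bound: e.g. a
// DT_FLOAT tensor of shape [?, 128] is treated as [1, 128], i.e.
// 128 elements * 4 bytes = 512 bytes.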
int64 CalculateTensorSize(const OpInfo::TensorProperties& prop) {
  int64_t size = DataTypeSize(BaseType(prop.dtype()));
  TensorShapeProto shape = prop.shape();

  // Can't infer the size if the rank is unknown. It has to be at least a
  // scalar though.
  if (shape.unknown_rank()) {
    VLOG(2) << "CalculateTensorSize() -- unknown rank";
    return size;
  }

  // If one of the dimensions is unknown statically, assume it's at least one.
  for (int i = 0; i < shape.dim_size(); ++i) {
    if (shape.dim(i).size() < 0) {
      shape.mutable_dim(i)->set_size(1);
      VLOG(2) << "CalculateTensorSize() -- unknown dim: " << i;
    }
  }

  int64_t num_elems = TensorShape(shape).num_elements();
  return num_elems * size;
}

int64 CalculateOutputSize(
    const std::vector<OpInfo::TensorProperties>& output_properties,
    const int port_num) {
  if (port_num < 0) return 4;  // 4B for control dependency.

  if (port_num >= output_properties.size()) {
    LOG(ERROR) << "CalculateOutputSize() -- port_num: " << port_num
               << " >= output_properties.size(): " << output_properties.size();
    return 0;
  }

  return CalculateTensorSize(output_properties[port_num]);
}

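// Returns the properties of the device named by `device_str`, e.g.
// "/job:localhost/replica:0/task:0/device:GPU:0". Unparseable names and
// device types other than CPU and GPU yield a device of type "UNKNOWN".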
DeviceProperties GetDeviceInfo(const string& device_str) {
  DeviceProperties unknown;
  unknown.set_type("UNKNOWN");

  DeviceNameUtils::ParsedName parsed;
  if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
    if (parsed.type == "GPU") {
      TfDeviceId tf_device_id(parsed.id);
      PlatformDeviceId platform_device_id;
      Status s =
          GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
      if (!s.ok()) {
        // We are probably running a simulation without linking the CUDA
        // libraries.
        platform_device_id = PlatformDeviceId(parsed.id);
      }
      return GetLocalGPUInfo(platform_device_id);
    } else if (parsed.type == "CPU") {
      return GetLocalCPUInfo();
    }
  }
  return unknown;
}

DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
  return GetDeviceInfo(node.device());
}

OpInfo BuildOpInfoWithoutDevice(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    const std::vector<OpInfo::TensorProperties>& inputs) {
  OpInfo op_info;
  op_info.set_op(node.op());
  *op_info.mutable_attr() = node.attr();
  for (auto& input : inputs) {
    *op_info.add_inputs() = input;
  }
  ExtractExtraProperties(node, name_to_node, &op_info);
  return op_info;
}

string GetOpDescription(const OpInfo& op_info) {
  string description = "[";
  description += "Op=" + op_info.op() + ", ";
  description += "input_shapes=[";
  for (auto const& input : op_info.inputs()) {
    description += PartialTensorShape::DebugString(input.shape());
  }
  description += "]";
  return description;
}

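// Converts a cost graph, together with the graph it was computed from, into
// per-op performance data. Nodes missing from the cost graph (not executed,
// or optimized away) are skipped.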
OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph,
                                               const GraphDef& graph) {
  OpPerformanceList ret;
  std::unordered_map<string, const CostGraphDef::Node*> name_to_cost;
  std::unordered_map<string, const NodeDef*> name_to_node;
  for (auto& node : cost_graph.node()) {
    name_to_cost[node.name()] = &node;
  }
  for (auto& node : graph.node()) {
    name_to_node[node.name()] = &node;
  }

  for (const auto& node : graph.node()) {
    // Skip the nodes that are not in the cost graph: these are nodes that
    // aren't run, because they aren't in the intersection of transitive
    // fan-in of a fetch node and the transitive fan-out of an input, or nodes
    // that were optimized away by the optimizer. Since they don't contribute
    // to the execution time we simply discard them.
    auto it = name_to_cost.find(node.name());
    if (it == name_to_cost.end()) {
      continue;
    }
    const CostGraphDef::Node* cost_node = it->second;

    OpPerformance* perf = ret.add_op_performance();
    perf->set_node(node.name());

    std::vector<OpInfo::TensorProperties> inputs =
        FindInputFeatures(node, name_to_cost, name_to_node);
    *perf->mutable_op() = BuildOpInfoWithoutDevice(node, name_to_node, inputs);
    *perf->mutable_op()->mutable_device() = GetDeviceInfo(cost_node->device());

    perf->set_temporary_memory_size(cost_node->temporary_memory_size());
    // Note that CostGraphDef::Node::compute_cost is microseconds, while
    // OpPerformance.compute_cost is nanoseconds.
    perf->set_compute_cost(cost_node->compute_cost() * 1000);
    perf->set_compute_time(cost_node->compute_time() * 1000);
    perf->set_memory_time(cost_node->memory_time() * 1000);

    for (const auto& output_info : cost_node->output_info()) {
      perf->mutable_op_memory()->add_output_memory(output_info.size());
    }

    perf->mutable_op_memory()->set_temp_memory(
        cost_node->temporary_memory_size());
    perf->mutable_op_memory()->set_persistent_memory(
        cost_node->persistent_memory_size());
  }
  return ret;
}

void TensorSizeHistogram::Add(const uint64 value) {
  num_elem_++;
  sum_elem_ += value;
  min_ = std::min(min_, value);
  max_ = std::max(max_, value);
  buckets_[Index(value)]++;
}

void TensorSizeHistogram::Merge(const TensorSizeHistogram& src) {
  num_elem_ += src.num_elem_;
  sum_elem_ += src.sum_elem_;
  min_ = std::min(min_, src.min_);
  max_ = std::max(max_, src.max_);
  std::transform(buckets_.begin(), buckets_.end(), src.buckets_.begin(),
                 buckets_.begin(), std::plus<uint64>());
}

string TensorSizeHistogram::ToString() const {
  string r = absl::StrFormat(
      "Count: %lld, Average: %s, Min: %s, Max: %s"
      "\n------------------------------------------------------\n",
      num_elem_, strings::HumanReadableNumBytes(Average()),
      strings::HumanReadableNumBytes(min_),
      strings::HumanReadableNumBytes(max_));
  const double mult = num_elem_ > 0 ? 100.0 / num_elem_ : 0.0;
  uint64 cumul_sum = 0;

  for (int i = 0; i < buckets_.size(); i++) {
    if (buckets_[i] == 0) continue;
    cumul_sum += buckets_[i];
    uint64 left = i == 0 ? 0ULL : 1ULL << (i - 1);
    uint64 right = 1ULL << i;
    absl::StrAppendFormat(&r, "[ %12s, %12s) %7d %7.3f%% %7.3f%% ",
                          strings::HumanReadableNumBytes(left),
                          strings::HumanReadableNumBytes(right),
                          buckets_[i],         // count
                          mult * buckets_[i],  // percentage
                          mult * cumul_sum);   // cumulative percentage

    // Add hash marks based on percentage; 40 marks for 100%.
    auto marks = static_cast<int>(
        (static_cast<double>(40 * buckets_[i] + (num_elem_ >> 1)) / num_elem_));
    absl::StrAppendFormat(&r, "%s\n", std::string(marks, '#'));
  }
  return r;
}

const int TensorSizeHistogram::Index(const uint64 value) const {
  // Log2Floor64 returns -1 for 0, 0 for 1, 1 for 2-3, 2 for 4-7, ...
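  // Hence value 0 falls into bucket 0, 1 into bucket 1, [2, 4) into bucket 2,
  // and so on; anything at or above 2^(kMaxBuckets - 2) is clamped into the
  // last bucket.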
  const auto index = Log2Floor64(value) + 1;
  return std::min(index, kMaxBuckets - 1);
}

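// Returns a coarse "/<job>/<device type>" class for `device_name`, e.g.
// "/job:worker/replica:0/task:0/device:GPU:0" maps to "/worker/GPU". Names
// where ':' was replaced by '_' are repaired before parsing; anything still
// unparseable is classified as "Unclassified".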
string GetDeviceClassForNonChannelDevice(const string& device_name) {
  DeviceNameUtils::ParsedName parsed_name;
  bool parsed = DeviceNameUtils::ParseFullName(device_name, &parsed_name);
  if (!parsed) {
    string name = str_util::StringReplace(device_name, "/job_", "/job:", true);
    name = str_util::StringReplace(name, "/replica_", "/replica:", true);
    name = str_util::StringReplace(name, "/task_", "/task:", true);
    name = str_util::StringReplace(name, "/device_", "/device:", true);
    name = str_util::StringReplace(name, "GPU_", "GPU:", true);
    name = str_util::StringReplace(name, "CPU_", "CPU:", true);
    name = str_util::StringReplace(name, "gpu_", "gpu:", true);
    name = str_util::StringReplace(name, "cpu_", "cpu:", true);
    parsed = DeviceNameUtils::ParseFullName(name, &parsed_name);
  }
  if (parsed) {
    const string jobname = parsed_name.has_job ? parsed_name.job : "";
    return absl::StrCat("/", jobname, "/", parsed_name.type);
  } else {
    return "Unclassified";
  }
}

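// Maps a device name to its device class; channel device names of the form
// "Channel_from_<src>_to_<dst>" (the VirtualScheduler convention) become,
// e.g., "Channel: /worker/CPU -> /worker/GPU".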
string GetDeviceClass(const string& device_name) {
  // TODO(dyoon): channel device name follows the convention we currently have
  // in VirtualScheduler. This should be revised with VirtualScheduler as well
  // as VirtualPlacer in the future.
  if (device_name.find("Channel") != string::npos) {
    const string from = "_from_";
    const string to = "_to_";
    const auto from_loc = device_name.find(from);
    const auto to_loc = device_name.find(to);
    const auto src_device_full = device_name.substr(
        from_loc + from.size(), to_loc - (from_loc + from.size()));
    const auto dst_device_full = device_name.substr(to_loc + to.size());
    return absl::StrCat(
        "Channel", ": ", GetDeviceClassForNonChannelDevice(src_device_full),
        " -> ", GetDeviceClassForNonChannelDevice(dst_device_full));
  } else {
    return GetDeviceClassForNonChannelDevice(device_name);
  }
}

string GetStatsStringFromRunMetadata(const RunMetadata& run_metadata,
                                     bool verbosity) {
  // TODO(dyoon): print out other stats as needed.
  std::ostringstream output;

  // Tensor size histogram: if verbosity is set, output a per-device
  // histogram in addition to the aggregated per-class histogram.
  std::unordered_map<string, TensorSizeHistogram> device_to_hist_map;
  const auto& step_stats = run_metadata.step_stats();
  for (const auto& dev_stat : step_stats.dev_stats()) {
    const auto& device_name = dev_stat.device();
    auto& hist = device_to_hist_map[device_name];
    for (const auto& node_stat : dev_stat.node_stats()) {
      for (const auto& node_output : node_stat.output()) {
        // TODO(dyoon): Calculate tensor size from tensor_description's dtype
        // and shape, instead of using optional allocation_description.
        const auto size = node_output.tensor_description()
                              .allocation_description()
                              .allocated_bytes();
        hist.Add(size);
      }
    }
  }
  if (verbosity) {
    output << "\n";
    output << "Per device tensor size histogram.\n";
  }

  std::unordered_map<string, TensorSizeHistogram> device_class_to_hist_map;
  for (const auto& device_hist : device_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    if (verbosity) {
      output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
    }
    const auto device_class = GetDeviceClass(device_name);
    auto it = device_class_to_hist_map.find(device_class);
    if (it == device_class_to_hist_map.end()) {
      device_class_to_hist_map.emplace(device_class, TensorSizeHistogram(hist));
    } else {
      it->second.Merge(hist);
    }
  }
  output << "\n";
  output << "Aggregated per device / channel type tensor size histogram:\n";
  for (const auto& device_hist : device_class_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
  }
  output << "\n";

  return output.str();
}

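// Sets costs->execution_time from the compute, memory, and intermediate
// memory times: their maximum if compute and memory accesses can overlap,
// their sum otherwise.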
void CombineCostsAndUpdateExecutionTime(bool compute_memory_overlap,
                                        Costs* costs) {
  if (compute_memory_overlap) {
    costs->execution_time =
        std::max(costs->intermediate_memory_time,
                 std::max(costs->compute_time, costs->memory_time));
  } else {
    costs->execution_time = costs->compute_time + costs->memory_time +
                            costs->intermediate_memory_time;
  }
}

}  // end namespace grappler
}  // end namespace tensorflow