/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/grappler/costs/utils.h"

#include <stddef.h>

#include <utility>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_def.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/tensor_description.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/tensor_id.h"
#include "tensorflow/core/grappler/clusters/utils.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/platform/byte_order.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/protobuf/config.pb.h"
#include "tensorflow/core/util/device_name_utils.h"

namespace tensorflow {
namespace grappler {

static OpInfo::TensorProperties UnknownInput() {
  OpInfo::TensorProperties input;
  input.set_dtype(DataType::DT_INVALID);
  input.mutable_shape()->set_unknown_rank(true);
  return input;
}

static std::vector<TensorProto> ExtractTensors(const AttrValue& attr_value) {
  std::vector<TensorProto> tensors;
  switch (attr_value.value_case()) {
    case AttrValue::kTensor: {
      tensors.push_back(attr_value.tensor());
      break;
    }
    case AttrValue::kList: {
      for (const auto& tensor_proto : attr_value.list().tensor()) {
        tensors.push_back(tensor_proto);
      }
      break;
    }
    default: {
    }
  }
  return tensors;
}
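
// Illustrative example (not from the original source): for a Const node's
// singleton "value" attribute this yields a one-element vector, e.g.
//   AttrValue v;
//   v.mutable_tensor()->set_dtype(DT_FLOAT);
//   std::vector<TensorProto> ts = ExtractTensors(v);  // ts.size() == 1
// A list-valued attribute yields every tensor in the list; any other
// attribute kind yields an empty vector.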

// Annotate the op_info inputs with extra information when possible (e.g. the
// input value if it's known statically).
static void ExtractExtraProperties(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    OpInfo* op_info) {
  OpRegistry* op_registry = OpRegistry::Global();
  const OpDef* op_def = nullptr;
  auto s = op_registry->LookUpOpDef(node.op(), &op_def);
  if (!s.ok()) {
    op_def = nullptr;
  }

  for (int i = 0; i < node.input_size(); ++i) {
    const string input_name = node.input(i);
    CHECK(!input_name.empty());
    if (IsControlInput(input_name)) {
      continue;
    }
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name(input_tensor_id.first);

    auto iter = name_to_node.find(input_node_name);
    if (iter == name_to_node.end()) continue;
    const NodeDef* input_node = iter->second;

    if (i >= op_info->inputs_size()) {
      LOG(ERROR) << "OpInfo's inputs don't match the graph! OpInfo: "
                 << op_info->DebugString()
                 << "\nCurrent node: " << node.DebugString()
                 << "\nInput node: " << input_node->DebugString();
    }

    // The value attribute of a Const input is useful for cost prediction.
    if (input_node->op() == "Const" && i < op_info->inputs_size()) {
      auto it = input_node->attr().find("value");
      if (it == input_node->attr().end()) continue;

      const AttrValue& attr_value = it->second;
      std::vector<TensorProto> tensors = ExtractTensors(attr_value);
      if (tensors.empty()) continue;

      const TensorProto& t = tensors[0];
      OpInfo::TensorProperties* input = op_info->mutable_inputs(i);
      *(input->mutable_value()) = t;

      // For a filename input, the file size can also be useful.
      if (op_def && i < op_def->input_arg_size() &&
          op_def->input_arg(i).name().find("filename") != string::npos) {
        Tensor tensor;
        if (!tensor.FromProto(t)) {
          continue;
        }
        if (tensor.NumElements() != 1) {
          continue;
        }
        const string& filename = tensor.scalar<tstring>()();

        Env* env = Env::Default();
        FileStatistics stat;
        Status s = env->Stat(filename, &stat);
        if (!s.ok()) {
          continue;
        }
        AttrValue attr;
        attr.set_i(stat.length);
        string attr_key = absl::StrCat("input_", i, "_filesize");
        (*op_info->mutable_attr())[attr_key] = attr;
      }
    }

    // When the input is a handle (e.g. a lookup table handle), the
    // information in the op itself is not sufficient to predict the op's
    // memory usage.
    if (op_def && i < op_def->input_arg_size() &&
        op_def->input_arg(i).name().find("handle") != string::npos) {
      string new_key = absl::StrCat("parent_", i, "_op");
      AttrValue attr;
      attr.set_s(input_node->op());
      (*op_info->mutable_attr())[new_key] = attr;
      // TODO(yuefengz): Only the parent node's op name is copied. Copy its
      // inputs and attributes when necessary.
    }
  }
}
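
// Illustrative sketch (not from the original source): for a node whose
// input 0 is a Const holding the scalar string "/tmp/data.txt" and feeds an
// input arg named "filename" (e.g. the ReadFile op), the loop above copies
// the TensorProto into op_info->inputs(0).value() and, assuming the file
// exists, records its size under the attr key "input_0_filesize". A
// handle-typed input (e.g. produced by HashTableV2) instead yields a
// "parent_0_op" attr naming the producing op.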

std::vector<OpInfo::TensorProperties> FindInputFeatures(
    const NodeDef& node,
    const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost,
    const std::unordered_map<string, const NodeDef*>& name_to_node) {
  std::vector<OpInfo::TensorProperties> inputs;
  for (const auto& input_name : node.input()) {
    CHECK(!input_name.empty());
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name(input_tensor_id.first);
    const int output_index = input_tensor_id.second;

    // Skip control inputs.
    if (output_index == Graph::kControlSlot) {
      continue;
    }

    auto it = name_to_cost.find(input_node_name);
    if (it == name_to_cost.end() || output_index < 0) {
      inputs.push_back(UnknownInput());
    } else {
      const CostGraphDef::Node* input_cost = it->second;
      if (input_cost->output_info_size() == 0) {
        inputs.push_back(UnknownInput());
      } else {
        const CostGraphDef::Node::OutputInfo& output =
            input_cost->output_info(output_index);
        OpInfo::TensorProperties input;
        input.set_dtype(output.dtype());
        *input.mutable_shape() = output.shape();
        inputs.push_back(input);
      }
    }
  }

  return inputs;
}
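
// Illustrative example (not from the original source): for a node with
// inputs {"a", "b:1", "^c"}, this returns the dtype and shape recorded in
// the cost graph for a:0 and b:1 (falling back to UnknownInput() when a
// producer is missing from name_to_cost), and skips the control input "^c".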

int64 CalculateTensorSize(const OpInfo::TensorProperties& prop) {
  int64 size = DataTypeSize(BaseType(prop.dtype()));
  TensorShapeProto shape = prop.shape();

  // Can't infer the size if the rank is unknown. It has to be at least a
  // scalar though.
  if (shape.unknown_rank()) {
    VLOG(2) << "CalculateTensorSize() -- unknown rank";
    return size;
  }

  // If one of the dimensions is unknown statically, assume it's at least one.
  for (int i = 0; i < shape.dim_size(); ++i) {
    if (shape.dim(i).size() < 0) {
      shape.mutable_dim(i)->set_size(1);
      VLOG(2) << "CalculateTensorSize() -- unknown dim: " << i;
    }
  }

  int64 num_elems = TensorShape(shape).num_elements();
  return num_elems * size;
}
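
// Worked example (illustrative, not from the original source): for a
// DT_FLOAT tensor with shape [-1, 128], the unknown dimension is clamped to
// 1, so the result is 4 bytes * 1 * 128 = 512 bytes. A DT_FLOAT tensor of
// unknown rank is estimated as a single scalar: 4 bytes.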

int64 CalculateOutputSize(
    const std::vector<OpInfo::TensorProperties>& output_properties,
    const int port_num) {
  if (port_num < 0) return 4;  // 4B for control dependency.

  if (port_num >= output_properties.size()) {
    LOG(ERROR) << "CalculateOutputSize() -- port_num: " << port_num
               << " >= output_properties.size(): " << output_properties.size();
    return 0;
  }

  return CalculateTensorSize(output_properties[port_num]);
}

DeviceProperties GetDeviceInfo(const string& device_str) {
  DeviceProperties unknown;
  unknown.set_type("UNKNOWN");

  DeviceNameUtils::ParsedName parsed;
  if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
    if (parsed.type == "GPU") {
      TfGpuId tf_gpu_id(parsed.id);
      PlatformGpuId platform_gpu_id;
      Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
      if (!s.ok()) {
        // We are probably running a simulation without linking the CUDA
        // libraries.
        platform_gpu_id = PlatformGpuId(parsed.id);
      }
      return GetLocalGPUInfo(platform_gpu_id);
    } else if (parsed.type == "CPU") {
      return GetLocalCPUInfo();
    }
  }
  return unknown;
}

DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
  return GetDeviceInfo(node.device());
}
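
// Illustrative example (not from the original source): a fully qualified
// name such as "/job:localhost/replica:0/task:0/device:GPU:0" parses to
// type "GPU" and id 0, so the properties of local GPU 0 are returned; a
// name that fails to parse yields DeviceProperties of type "UNKNOWN".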

OpInfo BuildOpInfoWithoutDevice(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    const std::vector<OpInfo::TensorProperties>& inputs) {
  OpInfo op_info;
  op_info.set_op(node.op());
  *op_info.mutable_attr() = node.attr();
  for (auto& input : inputs) {
    *op_info.add_inputs() = input;
  }
  ExtractExtraProperties(node, name_to_node, &op_info);
  return op_info;
}

string GetOpDescription(const OpInfo& op_info) {
  string description = "[";
  description += "Op=" + op_info.op() + ", ";
  description += "input_shapes=[";
  for (auto const& input : op_info.inputs()) {
    description += PartialTensorShape::DebugString(input.shape());
  }
  // Close both input_shapes=[ and the outer bracket.
  description += "]]";
  return description;
}

OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph,
                                               const GraphDef& graph) {
  OpPerformanceList ret;
  std::unordered_map<string, const CostGraphDef::Node*> name_to_cost;
  std::unordered_map<string, const NodeDef*> name_to_node;
  for (auto& node : cost_graph.node()) {
    name_to_cost[node.name()] = &node;
  }
  for (auto& node : graph.node()) {
    name_to_node[node.name()] = &node;
  }

  for (const auto& node : graph.node()) {
    // Skip the nodes that are not in the cost graph: these are nodes that
    // aren't run, because they aren't in the intersection of the transitive
    // fan-in of a fetch node and the transitive fan-out of an input, or
    // nodes that were optimized away by the optimizer. Since they don't
    // contribute to the execution time we simply discard them.
    auto it = name_to_cost.find(node.name());
    if (it == name_to_cost.end()) {
      continue;
    }
    const CostGraphDef::Node* cost_node = it->second;

    OpPerformance* perf = ret.add_op_performance();
    perf->set_node(node.name());

    std::vector<OpInfo::TensorProperties> inputs =
        FindInputFeatures(node, name_to_cost, name_to_node);
    *perf->mutable_op() = BuildOpInfoWithoutDevice(node, name_to_node, inputs);
    *perf->mutable_op()->mutable_device() = GetDeviceInfo(cost_node->device());

    perf->set_temporary_memory_size(cost_node->temporary_memory_size());
    // Note that CostGraphDef::Node::compute_cost is in microseconds, while
    // OpPerformance.compute_cost is in nanoseconds.
    perf->set_compute_cost(cost_node->compute_cost() * 1000);
    perf->set_compute_time(cost_node->compute_time() * 1000);
    perf->set_memory_time(cost_node->memory_time() * 1000);

    for (const auto& output_info : cost_node->output_info()) {
      perf->mutable_op_memory()->add_output_memory(output_info.size());
    }

    perf->mutable_op_memory()->set_temp_memory(
        cost_node->temporary_memory_size());
    perf->mutable_op_memory()->set_persistent_memory(
        cost_node->persistent_memory_size());
  }
  return ret;
}
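
// Example usage (illustrative, not from the original source): given a
// RunMetadata whose cost_graph field was populated (e.g. by enabling the
// cost model in the session's GraphOptions), a caller might convert it to
// per-op performance data with:
//   OpPerformanceList perf_list =
//       CostGraphToOpPerformanceData(run_metadata.cost_graph(), graph_def);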

void TensorSizeHistogram::Add(const uint64 value) {
  num_elem_++;
  sum_elem_ += value;
  min_ = std::min(min_, value);
  max_ = std::max(max_, value);
  buckets_[Index(value)]++;
}

void TensorSizeHistogram::Merge(const TensorSizeHistogram& src) {
  num_elem_ += src.num_elem_;
  sum_elem_ += src.sum_elem_;
  min_ = std::min(min_, src.min_);
  max_ = std::max(max_, src.max_);
  std::transform(buckets_.begin(), buckets_.end(), src.buckets_.begin(),
                 buckets_.begin(), std::plus<uint64>());
}

string TensorSizeHistogram::ToString() const {
  string r = absl::StrFormat(
      "Count: %lld, Average: %s, Min: %s, Max: %s"
      "\n------------------------------------------------------\n",
      num_elem_, strings::HumanReadableNumBytes(Average()),
      strings::HumanReadableNumBytes(min_),
      strings::HumanReadableNumBytes(max_));
  const double mult = num_elem_ > 0 ? 100.0 / num_elem_ : 0.0;
  uint64 cumul_sum = 0;

  for (int i = 0; i < buckets_.size(); i++) {
    if (buckets_[i] == 0) continue;
    cumul_sum += buckets_[i];
    uint64 left = i == 0 ? 0ULL : 1ULL << (i - 1);
    uint64 right = 1ULL << i;
    absl::StrAppendFormat(&r, "[ %12s, %12s) %7d %7.3f%% %7.3f%% ",
                          strings::HumanReadableNumBytes(left),
                          strings::HumanReadableNumBytes(right),
                          buckets_[i],         // count
                          mult * buckets_[i],  // percentage
                          mult * cumul_sum);   // cumulative percentage

    // Add hash marks based on percentage; 40 marks for 100%.
    auto marks = static_cast<int>(
        (static_cast<double>(40 * buckets_[i] + (num_elem_ >> 1)) /
         num_elem_));
    absl::StrAppendFormat(&r, "%s\n", std::string(marks, '#'));
  }
  return r;
}

const int TensorSizeHistogram::Index(const uint64 value) const {
  // Log2Floor64 returns -1 for 0, 0 for 1, 1 for 2-3, 2 for 4-7, ...
  const auto index = Log2Floor64(value) + 1;
  return std::min(index, kMaxBuckets - 1);
}
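
// Worked example (illustrative, not from the original source): Add(1000)
// computes Log2Floor64(1000) == 9, so the value lands in bucket index 10,
// which covers the half-open range [2^9, 2^10); values past the last bucket
// are clamped to index kMaxBuckets - 1.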

string GetDeviceClassForNonChannelDevice(const string& device_name) {
  DeviceNameUtils::ParsedName parsed_name;
  bool parsed = DeviceNameUtils::ParseFullName(device_name, &parsed_name);
  if (!parsed) {
    string name = str_util::StringReplace(device_name, "/job_", "/job:", true);
    name = str_util::StringReplace(name, "/replica_", "/replica:", true);
    name = str_util::StringReplace(name, "/task_", "/task:", true);
    name = str_util::StringReplace(name, "/device_", "/device:", true);
    name = str_util::StringReplace(name, "GPU_", "GPU:", true);
    name = str_util::StringReplace(name, "CPU_", "CPU:", true);
    name = str_util::StringReplace(name, "gpu_", "gpu:", true);
    name = str_util::StringReplace(name, "cpu_", "cpu:", true);
    parsed = DeviceNameUtils::ParseFullName(name, &parsed_name);
  }
  if (parsed) {
    const string jobname = parsed_name.has_job ? parsed_name.job : "";
    return absl::StrCat("/", jobname, "/", parsed_name.type);
  } else {
    return "Unclassified";
  }
}
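
// Illustrative example (not from the original source): both
// "/job:worker/replica:0/task:0/device:GPU:0" and its underscore-sanitized
// variant "/job_worker/replica_0/task_0/device_GPU_0" map to the class
// "/worker/GPU"; a name that still fails to parse after the underscore
// fix-ups is reported as "Unclassified".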

string GetDeviceClass(const string& device_name) {
  // TODO(dyoon): channel device names follow the convention we currently
  // have in VirtualScheduler. This should be revised with VirtualScheduler
  // as well as VirtualPlacer in the future.
  if (device_name.find("Channel") != string::npos) {
    const string from = "_from_";
    const string to = "_to_";
    const auto from_loc = device_name.find(from);
    const auto to_loc = device_name.find(to);
    const auto src_device_full = device_name.substr(
        from_loc + from.size(), to_loc - (from_loc + from.size()));
    const auto dst_device_full = device_name.substr(to_loc + to.size());
    return absl::StrCat(
        "Channel", ": ", GetDeviceClassForNonChannelDevice(src_device_full),
        " -> ", GetDeviceClassForNonChannelDevice(dst_device_full));
  } else {
    return GetDeviceClassForNonChannelDevice(device_name);
  }
}
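
// Illustrative example (not from the original source, assuming the
// VirtualScheduler channel naming convention): a channel device named
//   "Channel_from_/job:w/replica:0/task:0/device:CPU:0"
//   "_to_/job:w/replica:0/task:0/device:GPU:0"
// is classified as "Channel: /w/CPU -> /w/GPU".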

string GetStatsStringFromRunMetadata(const RunMetadata& run_metadata,
                                     bool verbosity) {
  // TODO(dyoon): print out other stats as needed.
  std::ostringstream output;

  // Tensor size histogram: if verbosity is set, output a per-device
  // histogram in addition to the aggregated per-class histogram.
  std::unordered_map<string, TensorSizeHistogram> device_to_hist_map;
  const auto& step_stats = run_metadata.step_stats();
  for (const auto& dev_stat : step_stats.dev_stats()) {
    const auto& device_name = dev_stat.device();
    auto& hist = device_to_hist_map[device_name];
    for (const auto& node_stat : dev_stat.node_stats()) {
      for (const auto& node_output : node_stat.output()) {
        // TODO(dyoon): Calculate tensor size from tensor_description's dtype
        // and shape, instead of using the optional allocation_description.
        const auto size = node_output.tensor_description()
                              .allocation_description()
                              .allocated_bytes();
        hist.Add(size);
      }
    }
  }
  if (verbosity) {
    output << "\n";
    output << "Per device tensor size histogram.\n";
  }

  std::unordered_map<string, TensorSizeHistogram> device_class_to_hist_map;
  for (const auto& device_hist : device_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    if (verbosity) {
      output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
    }
    const auto device_class = GetDeviceClass(device_name);
    auto it = device_class_to_hist_map.find(device_class);
    if (it == device_class_to_hist_map.end()) {
      device_class_to_hist_map.emplace(device_class, TensorSizeHistogram(hist));
    } else {
      it->second.Merge(hist);
    }
  }
  output << "\n";
  output << "Aggregated per device / channel type tensor size histogram:\n";
  for (const auto& device_hist : device_class_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
  }
  output << "\n";

  return output.str();
}

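// Illustrative note (not from the original source): with compute_time = 10,
// memory_time = 4, and intermediate_memory_time = 2 (in the same time unit),
// the overlapped estimate below is max(10, 4, 2) = 10, whereas the
// serialized estimate is 10 + 4 + 2 = 16.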
void CombineCostsAndUpdateExecutionTime(bool compute_memory_overlap,
                                        Costs* costs) {
  if (compute_memory_overlap) {
    costs->execution_time =
        std::max(costs->intermediate_memory_time,
                 std::max(costs->compute_time, costs->memory_time));
  } else {
    costs->execution_time = costs->compute_time + costs->memory_time +
                            costs->intermediate_memory_time;
  }
}

}  // end namespace grappler
}  // end namespace tensorflow