/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/grappler/costs/utils.h"

#include <stddef.h>

#include <utility>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_def.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/tensor_description.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/tensor_id.h"
#include "tensorflow/core/grappler/clusters/utils.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/platform/byte_order.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/protobuf/config.pb.h"
#include "tensorflow/core/util/device_name_utils.h"

namespace tensorflow {
namespace grappler {
static OpInfo::TensorProperties UnknownInput() {
  OpInfo::TensorProperties input;
  input.set_dtype(DataType::DT_INVALID);
  input.mutable_shape()->set_unknown_rank(true);
  return input;
}

static std::vector<TensorProto> ExtractTensors(const AttrValue& attr_value) {
  std::vector<TensorProto> tensors;
  switch (attr_value.value_case()) {
    case AttrValue::kTensor: {
      tensors.push_back(attr_value.tensor());
      break;
    }
    case AttrValue::kList: {
      for (const auto& tensor_proto : attr_value.list().tensor()) {
        tensors.push_back(tensor_proto);
      }
      break;
    }
    default: {
    }
  }
  return tensors;
}
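
// Note: a Const node stores its payload under the "value" attr as a single
// kTensor; list-valued attrs (list(tensor)) carry one TensorProto per
// element. Any other AttrValue case yields an empty vector here.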

// Annotate the op_info inputs with extra information when possible (e.g. the
// input value if it's known statically).
static void ExtractExtraProperties(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    OpInfo* op_info) {
  OpRegistry* op_registry = OpRegistry::Global();
  const OpDef* op_def = nullptr;
  auto s = op_registry->LookUpOpDef(node.op(), &op_def);
  if (!s.ok()) {
    op_def = nullptr;
  }

  for (int i = 0; i < node.input_size(); ++i) {
    const string input_name = node.input(i);
    CHECK(!input_name.empty());
    if (IsControlInput(input_name)) {
      continue;
    }
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name(input_tensor_id.first);

    auto iter = name_to_node.find(input_node_name);
    if (iter == name_to_node.end()) continue;
    const NodeDef* input_node = iter->second;

    if (i >= op_info->inputs_size()) {
      LOG(ERROR) << "OpInfo's inputs don't match the graph! OpInfo: "
                 << op_info->DebugString()
                 << "\nCurrent node: " << node.DebugString()
                 << "\nInput node: " << input_node->DebugString();
    }

    // The value attribute in a Const input is useful for cost prediction.
    if (input_node->op() == "Const" && i < op_info->inputs_size()) {
      auto it = input_node->attr().find("value");
      if (it == input_node->attr().end()) continue;

      const AttrValue& attr_value = it->second;
      std::vector<TensorProto> tensors = ExtractTensors(attr_value);
      if (tensors.empty()) continue;

      const TensorProto& t = tensors[0];
      OpInfo::TensorProperties* input = op_info->mutable_inputs(i);
      *(input->mutable_value()) = t;

      // For a filename input, the file size can also be useful.
      if (op_def && i < op_def->input_arg_size() &&
          op_def->input_arg(i).name().find("filename") != string::npos) {
        Tensor tensor;
        if (!tensor.FromProto(t)) {
          continue;
        }
        if (tensor.NumElements() != 1) {
          continue;
        }
        const string& filename = tensor.scalar<tstring>()();

        Env* env = Env::Default();
        FileStatistics stat;
        Status s = env->Stat(filename, &stat);
        if (!s.ok()) {
          continue;
        }
        AttrValue attr;
        attr.set_i(stat.length);
        string attr_key = absl::StrCat("input_", i, "_filesize");
        (*op_info->mutable_attr())[attr_key] = attr;
      }
    }

    // When the input is a handle (e.g. a lookup table handle), the information
    // in the op itself is not sufficient to predict the op's memory use.
    if (op_def && i < op_def->input_arg_size() &&
        op_def->input_arg(i).name().find("handle") != string::npos) {
      string new_key = absl::StrCat("parent_", i, "_op");
      AttrValue attr;
      attr.set_s(input_node->op());
      (*op_info->mutable_attr())[new_key] = attr;
      // TODO(yuefengz): Only the parent node's op name is copied. Copy its
      // inputs and attributes when necessary.
    }
  }
}
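
// Examples of the extra attrs produced above: if input 0 of a node is a Const
// feeding an input arg whose name contains "filename", op_info gains an
// "input_0_filesize" attr holding the file's length in bytes; if input 1
// feeds an arg whose name contains "handle", op_info gains a "parent_1_op"
// attr holding the producer's op name.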

std::vector<OpInfo::TensorProperties> FindInputFeatures(
    const NodeDef& node,
    const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost,
    const std::unordered_map<string, const NodeDef*>& name_to_node) {
  std::vector<OpInfo::TensorProperties> inputs;
  for (const auto& input_name : node.input()) {
    CHECK(!input_name.empty());
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name(input_tensor_id.first);
    const int output_index = input_tensor_id.second;

    // Skip control inputs.
    if (output_index == Graph::kControlSlot) {
      continue;
    }

    auto it = name_to_cost.find(input_node_name);
    if (it == name_to_cost.end() || output_index < 0) {
      inputs.push_back(UnknownInput());
    } else {
      const CostGraphDef::Node* input_cost = it->second;
      if (input_cost->output_info_size() == 0) {
        inputs.push_back(UnknownInput());
      } else {
        const CostGraphDef::Node::OutputInfo& output =
            input_cost->output_info(output_index);
        OpInfo::TensorProperties input;
        input.set_dtype(output.dtype());
        *input.mutable_shape() = output.shape();
        inputs.push_back(input);
      }
    }
  }

  return inputs;
}
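
// Inputs that cannot be resolved from the cost graph (producer not found,
// negative output index, or missing output_info) are represented by
// UnknownInput() rather than dropped, so positions stay aligned with the
// node's non-control inputs.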

int64 CalculateTensorSize(const OpInfo::TensorProperties& prop) {
  int64_t size = DataTypeSize(BaseType(prop.dtype()));
  TensorShapeProto shape = prop.shape();

  // Can't infer the size if the rank is unknown. It has to be at least a
  // scalar though.
  if (shape.unknown_rank()) {
    VLOG(2) << "CalculateTensorSize() -- unknown rank";
    return size;
  }

  // If one of the dimensions is unknown statically, assume it's at least one.
  for (int i = 0; i < shape.dim_size(); ++i) {
    if (shape.dim(i).size() < 0) {
      shape.mutable_dim(i)->set_size(1);
      VLOG(2) << "CalculateTensorSize() -- unknown dim: " << i;
    }
  }

  int64_t num_elems = TensorShape(shape).num_elements();
  return num_elems * size;
}
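
// Worked example: for DT_FLOAT with shape [32, -1, 128], the unknown middle
// dimension is assumed to be 1, giving 32 * 1 * 128 * 4 = 16384 bytes; with
// unknown rank, only the 4-byte element size is returned.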

int64 CalculateOutputSize(
    const std::vector<OpInfo::TensorProperties>& output_properties,
    const int port_num) {
  if (port_num < 0) return 4;  // 4B for control dependency.

  if (port_num >= output_properties.size()) {
    LOG(ERROR) << "CalculateOutputSize() -- port_num: " << port_num
               << " >= output_properties.size(): " << output_properties.size();
    return 0;
  }

  return CalculateTensorSize(output_properties[port_num]);
}

DeviceProperties GetDeviceInfo(const string& device_str) {
  DeviceProperties unknown;
  unknown.set_type("UNKNOWN");

  DeviceNameUtils::ParsedName parsed;
  if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
    if (parsed.type == "GPU") {
      TfDeviceId tf_device_id(parsed.id);
      PlatformDeviceId platform_device_id;
      Status s =
          GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id);
      if (!s.ok()) {
        // We are probably running a simulation without linking the CUDA
        // libraries.
        platform_device_id = PlatformDeviceId(parsed.id);
      }
      return GetLocalGPUInfo(platform_device_id);
    } else if (parsed.type == "CPU") {
      return GetLocalCPUInfo();
    }
  }
  return unknown;
}
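
// For example, "/job:localhost/replica:0/task:0/device:GPU:0" resolves to the
// properties of local GPU 0. Names that fail to parse, or whose type is
// neither GPU nor CPU, yield the "UNKNOWN" device.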

DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
  return GetDeviceInfo(node.device());
}

OpInfo BuildOpInfoWithoutDevice(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    const std::vector<OpInfo::TensorProperties>& inputs) {
  OpInfo op_info;
  op_info.set_op(node.op());
  *op_info.mutable_attr() = node.attr();
  for (auto& input : inputs) {
    *op_info.add_inputs() = input;
  }
  ExtractExtraProperties(node, name_to_node, &op_info);
  return op_info;
}

string GetOpDescription(const OpInfo& op_info) {
  string description = "[";
  description += "Op=" + op_info.op() + ", ";
  description += "input_shapes=[";
  for (auto const& input : op_info.inputs()) {
    description += PartialTensorShape::DebugString(input.shape());
  }
  description += "]]";
  return description;
}
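
// For example, a MatMul op with inputs of shape [2,3] and [3,4] is described
// as "[Op=MatMul, input_shapes=[[2,3][3,4]]]".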

OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph,
                                               const GraphDef& graph) {
  OpPerformanceList ret;
  std::unordered_map<string, const CostGraphDef::Node*> name_to_cost;
  std::unordered_map<string, const NodeDef*> name_to_node;
  for (auto& node : cost_graph.node()) {
    name_to_cost[node.name()] = &node;
  }
  for (auto& node : graph.node()) {
    name_to_node[node.name()] = &node;
  }

  for (const auto& node : graph.node()) {
    // Skip the nodes that are not in the cost graph: these are nodes that
    // aren't run, because they aren't in the intersection of the transitive
    // fan-in of a fetch node and the transitive fan-out of an input, or nodes
    // that were optimized away by the optimizer. Since they don't contribute
    // to the execution time we simply discard them.
    auto it = name_to_cost.find(node.name());
    if (it == name_to_cost.end()) {
      continue;
    }
    const CostGraphDef::Node* cost_node = it->second;

    OpPerformance* perf = ret.add_op_performance();
    perf->set_node(node.name());

    std::vector<OpInfo::TensorProperties> inputs =
        FindInputFeatures(node, name_to_cost, name_to_node);
    *perf->mutable_op() = BuildOpInfoWithoutDevice(node, name_to_node, inputs);
    *perf->mutable_op()->mutable_device() = GetDeviceInfo(cost_node->device());

    perf->set_temporary_memory_size(cost_node->temporary_memory_size());
    // Note that CostGraphDef::Node::compute_cost is microseconds, while
    // OpPerformance.compute_cost is nanoseconds.
    perf->set_compute_cost(cost_node->compute_cost() * 1000);
    perf->set_compute_time(cost_node->compute_time() * 1000);
    perf->set_memory_time(cost_node->memory_time() * 1000);

    for (const auto& output_info : cost_node->output_info()) {
      perf->mutable_op_memory()->add_output_memory(output_info.size());
    }

    perf->mutable_op_memory()->set_temp_memory(
        cost_node->temporary_memory_size());
    perf->mutable_op_memory()->set_persistent_memory(
        cost_node->persistent_memory_size());
  }
  return ret;
}
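
// Intended use (a sketch): given a CostGraphDef collected into RunMetadata by
// an actual or simulated execution, pair it with the original GraphDef to
// obtain an OpPerformanceList carrying per-op timings (in nanoseconds) and
// memory figures.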

void TensorSizeHistogram::Add(const uint64 value) {
  num_elem_++;
  sum_elem_ += value;
  min_ = std::min(min_, value);
  max_ = std::max(max_, value);
  buckets_[Index(value)]++;
}

void TensorSizeHistogram::Merge(const TensorSizeHistogram& src) {
  num_elem_ += src.num_elem_;
  sum_elem_ += src.sum_elem_;
  min_ = std::min(min_, src.min_);
  max_ = std::max(max_, src.max_);
  std::transform(buckets_.begin(), buckets_.end(), src.buckets_.begin(),
                 buckets_.begin(), std::plus<uint64>());
}

string TensorSizeHistogram::ToString() const {
  string r = absl::StrFormat(
      "Count: %lld, Average: %s, Min: %s, Max: %s"
      "\n------------------------------------------------------\n",
      num_elem_, strings::HumanReadableNumBytes(Average()),
      strings::HumanReadableNumBytes(min_),
      strings::HumanReadableNumBytes(max_));
  const double mult = num_elem_ > 0 ? 100.0 / num_elem_ : 0.0;
  uint64 cumul_sum = 0;

  for (int i = 0; i < buckets_.size(); i++) {
    if (buckets_[i] == 0) continue;
    cumul_sum += buckets_[i];
    uint64 left = i == 0 ? 0ULL : 1ULL << (i - 1);
    uint64 right = 1ULL << i;
    absl::StrAppendFormat(&r, "[ %12s, %12s) %7d %7.3f%% %7.3f%% ",
                          strings::HumanReadableNumBytes(left),
                          strings::HumanReadableNumBytes(right),
                          buckets_[i],         // count
                          mult * buckets_[i],  // percentage
                          mult * cumul_sum);   // cumulative percentage

    // Add hash marks based on percentage; 40 marks for 100%.
    auto marks = static_cast<int>(
        (static_cast<double>(40 * buckets_[i] + (num_elem_ >> 1)) / num_elem_));
    absl::StrAppendFormat(&r, "%s\n", std::string(marks, '#'));
  }
  return r;
}
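
// Each bucket line shows the [low, high) byte range, the count, the
// percentage of all elements, the cumulative percentage, and a bar of '#'
// marks scaled so that 40 marks correspond to 100%.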

const int TensorSizeHistogram::Index(const uint64 value) const {
  // Log2Floor64 returns -1 for 0, 0 for 1, 1 for 2-3, 2 for 4-7, ...
  const auto index = Log2Floor64(value) + 1;
  return std::min(index, kMaxBuckets - 1);
}
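
// Bucket mapping examples: 0 -> bucket 0, 1 -> bucket 1, 2..3 -> bucket 2,
// 4..7 -> bucket 3, and so on, capped at kMaxBuckets - 1.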

string GetDeviceClassForNonChannelDevice(const string& device_name) {
  DeviceNameUtils::ParsedName parsed_name;
  bool parsed = DeviceNameUtils::ParseFullName(device_name, &parsed_name);
  if (!parsed) {
    string name = str_util::StringReplace(device_name, "/job_", "/job:", true);
    name = str_util::StringReplace(name, "/replica_", "/replica:", true);
    name = str_util::StringReplace(name, "/task_", "/task:", true);
    name = str_util::StringReplace(name, "/device_", "/device:", true);
    name = str_util::StringReplace(name, "GPU_", "GPU:", true);
    name = str_util::StringReplace(name, "CPU_", "CPU:", true);
    name = str_util::StringReplace(name, "gpu_", "gpu:", true);
    name = str_util::StringReplace(name, "cpu_", "cpu:", true);
    parsed = DeviceNameUtils::ParseFullName(name, &parsed_name);
  }
  if (parsed) {
    const string jobname = parsed_name.has_job ? parsed_name.job : "";
    return absl::StrCat("/", jobname, "/", parsed_name.type);
  } else {
    return "Unclassified";
  }
}
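
// For example, "/job:worker/replica:0/task:1/device:GPU:0" maps to
// "/worker/GPU", and a name that still fails to parse after the underscore
// repairs above maps to "Unclassified".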

string GetDeviceClass(const string& device_name) {
  // TODO(dyoon): channel device names follow the convention we currently have
  // in VirtualScheduler. This should be revised along with VirtualScheduler
  // as well as VirtualPlacer in the future.
  if (device_name.find("Channel") != string::npos) {
    const string from = "_from_";
    const string to = "_to_";
    const auto from_loc = device_name.find(from);
    const auto to_loc = device_name.find(to);
    const auto src_device_full = device_name.substr(
        from_loc + from.size(), to_loc - (from_loc + from.size()));
    const auto dst_device_full = device_name.substr(to_loc + to.size());
    return absl::StrCat(
        "Channel", ": ", GetDeviceClassForNonChannelDevice(src_device_full),
        " -> ", GetDeviceClassForNonChannelDevice(dst_device_full));
  } else {
    return GetDeviceClassForNonChannelDevice(device_name);
  }
}
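
// A channel device name embeds its endpoints, e.g.
// "Channel_from_/job:w/replica:0/task:0/device:CPU:0_to_/job:w/replica:0/task:0/device:GPU:0"
// maps to "Channel: /w/CPU -> /w/GPU".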

string GetStatsStringFromRunMetadata(const RunMetadata& run_metadata,
                                     bool verbosity) {
  // TODO(dyoon): print out other stats as needed.
  std::ostringstream output;

  // Tensor size histogram: with verbosity, print a per-device histogram; the
  // aggregated per-class histogram is always printed.
  std::unordered_map<string, TensorSizeHistogram> device_to_hist_map;
  const auto& step_stats = run_metadata.step_stats();
  for (const auto& dev_stat : step_stats.dev_stats()) {
    const auto& device_name = dev_stat.device();
    auto& hist = device_to_hist_map[device_name];
    for (const auto& node_stat : dev_stat.node_stats()) {
      for (const auto& node_output : node_stat.output()) {
        // TODO(dyoon): Calculate tensor size from tensor_description's dtype
        // and shape, instead of using optional allocation_description.
        const auto size = node_output.tensor_description()
                              .allocation_description()
                              .allocated_bytes();
        hist.Add(size);
      }
    }
  }
  if (verbosity) {
    output << "\n";
    output << "Per device tensor size histogram.\n";
  }

  std::unordered_map<string, TensorSizeHistogram> device_class_to_hist_map;
  for (const auto& device_hist : device_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    if (verbosity) {
      output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
    }
    const auto device_class = GetDeviceClass(device_name);
    auto it = device_class_to_hist_map.find(device_class);
    if (it == device_class_to_hist_map.end()) {
      device_class_to_hist_map.emplace(device_class, TensorSizeHistogram(hist));
    } else {
      it->second.Merge(hist);
    }
  }
  output << "\n";
  output << "Aggregated per device / channel type tensor size histogram:\n";
  for (const auto& device_hist : device_class_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
  }
  output << "\n";

  return output.str();
}

void CombineCostsAndUpdateExecutionTime(bool compute_memory_overlap,
                                        Costs* costs) {
  if (compute_memory_overlap) {
    costs->execution_time =
        std::max(costs->intermediate_memory_time,
                 std::max(costs->compute_time, costs->memory_time));
  } else {
    costs->execution_time = costs->compute_time + costs->memory_time +
                            costs->intermediate_memory_time;
  }
}
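
// With compute_memory_overlap, compute, memory, and intermediate memory
// accesses are assumed to overlap completely, so the slowest of the three
// components dominates; otherwise they are assumed to run serially and their
// times add up.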
}  // end namespace grappler
}  // end namespace tensorflow