• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/grappler/costs/virtual_placer.h"
17 #include "tensorflow/core/framework/node_def.pb.h"
18 #include "tensorflow/core/grappler/clusters/cluster.h"
19 #include "tensorflow/core/grappler/devices.h"
20 #include "tensorflow/core/lib/strings/str_util.h"
21 #include "tensorflow/core/util/device_name_utils.h"
22 
23 namespace tensorflow {
24 namespace grappler {
25 
VirtualPlacer(const std::unordered_map<string,DeviceProperties> & devices)26 VirtualPlacer::VirtualPlacer(
27     const std::unordered_map<string, DeviceProperties>& devices)
28     : devices_(devices),
29       // Default job name for canonical device name. Needs to be set before the
30       // first call to to_lfqn_or_empty()
31       default_job_name_lowercase_("localhost") {
32   lfqn_map_.reserve(devices_.size());
33   for (const auto& kv : devices_) {
34     const auto lfqn = to_lfqn_or_empty(kv.first);
35     if (lfqn.empty()) {
36       LOG(ERROR) << "VirtualPlacer couldn't parse device name from cluster: "
37                  << kv.first;
38     } else {
39       lfqn_map_[lfqn] = kv.first;
40     }
41   }
42 
43   if (devices_.empty()) {
44     // If there are no devices in the cluster, add a single device, "UNKNOWN" to
45     // the cluster.
46     default_device_name_ = "UNKNOWN";
47     DeviceProperties& prop = devices_["UNKNOWN"];
48     prop.set_type("UNKNOWN");
49   } else if (devices_.size() == 1) {
50     // If there is only one device in the cluster, use it as default device,
51     // whatever it is.
52     default_device_name_ = devices_.begin()->first;
53   } else {
54     // Default device is set from the devices in the cluster in the following
55     // priority: /gpu:0, /cpu:0, or any device.
56     // TODO(dyoon): This logic assumes single machine with CPU and GPU devices.
57     // Make it more general to support multiple machines, job types, and devices
58     // other than CPU and GPU.
59     std::map<int, string> cpu_devices;  // CPU device map: id -> device name.
60     std::map<int, string> gpu_devices;  // GPU device map: id -> device name.
61     for (const auto& kv : lfqn_map_) {
62       const auto& lfqn = kv.first;
63       const auto& cluster_device_name = kv.second;
64       DeviceNameUtils::ParsedName parsed_name;
65       bool parsed = DeviceNameUtils::ParseFullName(lfqn, &parsed_name);
66       if (parsed) {
67         // Parsed devices are stored to cpu_devices or gpu_devices map,
68         // addressed (and ordered) by device id.
69         const auto type = absl::AsciiStrToLower(parsed_name.type);
70         if (type == "gpu") {
71           gpu_devices[parsed_name.id] = cluster_device_name;
72         } else if (type == "cpu") {
73           cpu_devices[parsed_name.id] = cluster_device_name;
74         }
75       }
76     }
77 
78     if (!gpu_devices.empty()) {
79       // GPU:0 (or GPU with smallest device id).
80       default_device_name_ = gpu_devices.begin()->second;
81     } else if (!cpu_devices.empty()) {
82       // CPU:0 (or CPU with smallest device id).
83       default_device_name_ = cpu_devices.begin()->second;
84     } else {
85       default_device_name_ = devices_.begin()->first;  // Any device.
86     }
87   }
88   VLOG(3) << "default device name: " << default_device_name_;
89 
90   // Scan the device names from the cluster, and if there is one job name used,
91   // use it for canonical device name.
92   std::unordered_set<string> job_names_from_cluster;
93   for (const auto& device : lfqn_map_) {
94     const auto& lfqn = device.first;
95     DeviceNameUtils::ParsedName parsed_name;
96     bool parsed = DeviceNameUtils::ParseFullName(lfqn, &parsed_name);
97     if (parsed && !parsed_name.job.empty()) {
98       job_names_from_cluster.insert(parsed_name.job);
99       if (job_names_from_cluster.size() > 1) {
100         break;
101       }
102     }
103   }
104   // If there is only one type of job name in all the devices in the cluster,
105   // use that one as default job name; otherwise, use localhost.
106   // TODO(dyoon): this should be improved, especially when the cluster is
107   // composed of multiple worker, PS, and other types of jobs.
108   if (job_names_from_cluster.size() == 1) {
109     auto it = job_names_from_cluster.begin();
110     default_job_name_lowercase_ = *it;
111   }
112   VLOG(3) << "default job name: " << default_job_name_lowercase_;
113 }
114 
get_device(const NodeDef & node) const115 const DeviceProperties& VirtualPlacer::get_device(const NodeDef& node) const {
116   string device = get_canonical_device_name(node);
117   VLOG(3) << "node.name=" << node.name() << " node.device=" << node.device()
118           << " is placed on: " << device;
119   auto it = devices_.find(device);
120   DCHECK(it != devices_.end());
121   return it->second;
122 }
123 
get_canonical_device_name(const NodeDef & node) const124 string VirtualPlacer::get_canonical_device_name(const NodeDef& node) const {
125   if (node.device().empty()) {
126     return default_device_name_;
127   }
128 
129   const auto lfqn = to_lfqn_or_empty(node.device());
130   if (lfqn.empty()) {
131     return default_device_name_;
132   }
133 
134   const auto it = lfqn_map_.find(lfqn);
135   if (it != lfqn_map_.end()) {
136     return it->second;
137   }
138 
139   return default_device_name_;
140 }
141 
// Converts `device_name` to a lowercase name of the form
//   /job:<job>/replica:<replica>/task:<task>/device:<type>:<id>
// Returns an empty string when the name cannot be parsed.
//
// Parsing is attempted in this order: full device name, local device name,
// and finally the bare type names "gpu" / "cpu".
string VirtualPlacer::to_lfqn_or_empty(const string& device_name) const {
  const auto lowered = absl::AsciiStrToLower(device_name);
  DeviceNameUtils::ParsedName fields;

  bool ok = DeviceNameUtils::ParseFullName(lowered, &fields);
  if (!ok) {
    // Not a full name: retry as a local name. The job is pinned to
    // "localhost" whether or not this attempt succeeds.
    ok = DeviceNameUtils::ParseLocalName(lowered, &fields);
    fields.job = "localhost";
    if (!ok && (lowered == "gpu" || lowered == "cpu")) {
      // Bare device type; the job is already "localhost" from above.
      fields.type = lowered;
      ok = true;
    }
  }
  if (!ok) {
    return {};
  }

  // A full name that carries no job inherits the placer's default job.
  if (fields.job.empty()) {
    fields.job = default_job_name_lowercase_;
  }

  // Have to do this, because parser returns uppercase types for CPU and GPU.
  fields.type = absl::AsciiStrToLower(fields.type);

  return strings::StrCat("/job:", fields.job, "/replica:", fields.replica,
                         "/task:", fields.task, "/device:", fields.type, ":",
                         fields.id);
}
174 
175 }  // end namespace grappler
176 }  // end namespace tensorflow
177