• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/grappler/costs/virtual_placer.h"
17 #include "tensorflow/core/framework/node_def.pb.h"
18 #include "tensorflow/core/grappler/clusters/cluster.h"
19 #include "tensorflow/core/grappler/devices.h"
20 #include "tensorflow/core/lib/strings/str_util.h"
21 #include "tensorflow/core/util/device_name_utils.h"
22 
23 namespace tensorflow {
24 namespace grappler {
25 
VirtualPlacer(const Cluster * cluster)26 VirtualPlacer::VirtualPlacer(const Cluster* cluster) {
27   CHECK(cluster);
28 
29   // Default job name for canonical device name. Needs to be set before the
30   // first call to to_lfqn_or_empty()
31   default_job_name_lowercase_ = "localhost";
32 
33   devices_ = cluster->GetDevices();
34   lfqn_map_.reserve(devices_.size());
35   for (const auto& kv : devices_) {
36     const auto lfqn = to_lfqn_or_empty(kv.first);
37     if (lfqn.empty()) {
38       LOG(ERROR) << "VirtualPlacer couldn't parse device name from cluster: "
39                  << kv.first;
40     } else {
41       lfqn_map_[lfqn] = kv.first;
42     }
43   }
44 
45   if (devices_.empty()) {
46     // If there are no devices in the cluster, add a single device, "UNKNOWN" to
47     // the cluster.
48     default_device_name_ = "UNKNOWN";
49     DeviceProperties& prop = devices_["UNKNOWN"];
50     prop.set_type("UNKNOWN");
51   } else if (devices_.size() == 1) {
52     // If there is only one device in the cluster, use it as default device,
53     // whatever it is.
54     default_device_name_ = devices_.begin()->first;
55   } else {
56     // Default device is set from the devices in the cluster in the following
57     // priority: /gpu:0, /cpu:0, or any device.
58     // TODO(dyoon): This logic assumes single machine with CPU and GPU devices.
59     // Make it more general to support multiple machines, job types, and devices
60     // other than CPU and GPU.
61     std::map<int, string> cpu_devices;  // CPU device map: id -> device name.
62     std::map<int, string> gpu_devices;  // GPU device map: id -> device name.
63     for (const auto& kv : lfqn_map_) {
64       const auto& lfqn = kv.first;
65       const auto& cluster_device_name = kv.second;
66       DeviceNameUtils::ParsedName parsed_name;
67       bool parsed = DeviceNameUtils::ParseFullName(lfqn, &parsed_name);
68       if (parsed) {
69         // Parsed devices are stored to cpu_devices or gpu_devices map,
70         // addressed (and ordered) by device id.
71         const auto type = str_util::Lowercase(parsed_name.type);
72         if (type == "gpu") {
73           gpu_devices[parsed_name.id] = cluster_device_name;
74         } else if (type == "cpu") {
75           cpu_devices[parsed_name.id] = cluster_device_name;
76         }
77       }
78     }
79 
80     if (!gpu_devices.empty()) {
81       // GPU:0 (or GPU with smallest device id).
82       default_device_name_ = gpu_devices.begin()->second;
83     } else if (!cpu_devices.empty()) {
84       // CPU:0 (or CPU with smallest device id).
85       default_device_name_ = cpu_devices.begin()->second;
86     } else {
87       default_device_name_ = devices_.begin()->first;  // Any device.
88     }
89   }
90   VLOG(3) << "default device name: " << default_device_name_;
91 
92   // Scan the device names from the cluster, and if there is one job name used,
93   // use it for canonical device name.
94   std::unordered_set<string> job_names_from_cluster;
95   for (const auto& device : lfqn_map_) {
96     const auto& lfqn = device.first;
97     DeviceNameUtils::ParsedName parsed_name;
98     bool parsed = DeviceNameUtils::ParseFullName(lfqn, &parsed_name);
99     if (parsed && !parsed_name.job.empty()) {
100       job_names_from_cluster.insert(parsed_name.job);
101       if (job_names_from_cluster.size() > 1) {
102         break;
103       }
104     }
105   }
106   // If there is only one type of job name in all the devices in the cluster,
107   // use that one as default job name; otherwise, use localhost.
108   // TODO(dyoon): this should be improved, especially when the cluster is
109   // composed of multiple worker, PS, and other types of jobs.
110   if (job_names_from_cluster.size() == 1) {
111     auto it = job_names_from_cluster.begin();
112     default_job_name_lowercase_ = *it;
113   }
114   VLOG(3) << "default job name: " << default_job_name_lowercase_;
115 }
116 
get_device(const NodeDef & node) const117 const DeviceProperties& VirtualPlacer::get_device(const NodeDef& node) const {
118   string device = get_canonical_device_name(node);
119   VLOG(3) << "node.name=" << node.name() << " node.device=" << node.device()
120           << " is placed on: " << device;
121   auto it = devices_.find(device);
122   DCHECK(it != devices_.end());
123   return it->second;
124 }
125 
get_canonical_device_name(const NodeDef & node) const126 string VirtualPlacer::get_canonical_device_name(const NodeDef& node) const {
127   if (node.device().empty()) {
128     return default_device_name_;
129   }
130 
131   const auto lfqn = to_lfqn_or_empty(node.device());
132   if (lfqn.empty()) {
133     return default_device_name_;
134   }
135 
136   const auto it = lfqn_map_.find(lfqn);
137   if (it != lfqn_map_.end()) {
138     return it->second;
139   }
140 
141   return default_device_name_;
142 }
143 
// Converts `device_name` to the lowercase form
//   /job:<job>/replica:<replica>/task:<task>/device:<type>:<id>
// or returns an empty string when the name cannot be interpreted.
//
// The lowercased name is tried, in order, as a full device name, as a local
// device name, and finally as the bare word "gpu" or "cpu".
string VirtualPlacer::to_lfqn_or_empty(const string& device_name) const {
  const string lowered = str_util::Lowercase(device_name);

  DeviceNameUtils::ParsedName fields;
  bool ok = DeviceNameUtils::ParseFullName(lowered, &fields);
  if (!ok) {
    // Anything that is not a full name is attributed to the "localhost" job,
    // whether or not the local-name parse succeeds.
    ok = DeviceNameUtils::ParseLocalName(lowered, &fields);
    fields.job = "localhost";
    if (!ok && (lowered == "gpu" || lowered == "cpu")) {
      // A bare device type with no id.
      fields.type = lowered;
      ok = true;
    }
  }
  if (!ok) {
    return string();
  }

  // A full name that carried no job falls back to the placer-wide default.
  if (fields.job.empty()) {
    fields.job = default_job_name_lowercase_;
  }

  // The parser hands back uppercase types for CPU and GPU, so normalize here.
  fields.type = str_util::Lowercase(fields.type);

  return strings::StrCat("/job:", fields.job, "/replica:", fields.replica,
                         "/task:", fields.task, "/device:", fields.type, ":",
                         fields.id);
}
176 
177 }  // end namespace grappler
178 }  // end namespace tensorflow
179