1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include <iterator>
17 #include <memory>
18 #include <vector>
19
20 #include "tensorflow/core/common_runtime/device_mgr.h"
21 #include "tensorflow/core/common_runtime/local_device.h"
22 #include "tensorflow/core/framework/device_attributes.pb.h"
23 #include "tensorflow/core/lib/core/errors.h"
24 #include "tensorflow/core/platform/logging.h"
25 #include "tensorflow/core/util/device_name_utils.h"
26
27 namespace tensorflow {
28
DynamicDeviceMgr()29 DynamicDeviceMgr::DynamicDeviceMgr() : cpu_device_(nullptr) {}
30
~DynamicDeviceMgr()31 DynamicDeviceMgr::~DynamicDeviceMgr() {
32 // Release resources ahead of destroying the device manager as the resource
33 // destructors (e.g. ~IteratorResource) assume devices still exist.
34 for (auto& pair : dynamic_devices_) {
35 pair.first->ClearResourceMgr();
36 }
37 }
38
ListDeviceAttributes(std::vector<DeviceAttributes> * devices) const39 void DynamicDeviceMgr::ListDeviceAttributes(
40 std::vector<DeviceAttributes>* devices) const {
41 tf_shared_lock l(devices_mu_);
42 devices->reserve(dynamic_devices_.size());
43 for (const auto& pair : dynamic_devices_) {
44 devices->emplace_back(pair.first->attributes());
45 }
46 }
47
ListDevices() const48 std::vector<Device*> DynamicDeviceMgr::ListDevices() const {
49 tf_shared_lock l(devices_mu_);
50 std::vector<Device*> devices;
51 devices.reserve(dynamic_devices_.size());
52 for (const auto& pair : dynamic_devices_) {
53 devices.emplace_back(pair.first);
54 }
55 return devices;
56 }
57
DebugString() const58 string DynamicDeviceMgr::DebugString() const {
59 string out;
60 tf_shared_lock l(devices_mu_);
61 for (const auto& pair : dynamic_devices_) {
62 strings::StrAppend(&out, pair.first->name(), "\n");
63 }
64 return out;
65 }
66
DeviceMappingString() const67 string DynamicDeviceMgr::DeviceMappingString() const {
68 string out;
69 tf_shared_lock l(devices_mu_);
70 for (const auto& pair : dynamic_devices_) {
71 if (!pair.first->attributes().physical_device_desc().empty()) {
72 strings::StrAppend(&out, pair.first->name(), " -> ",
73 pair.first->attributes().physical_device_desc(), "\n");
74 }
75 }
76 return out;
77 }
78
LookupDevice(StringPiece name,Device ** device) const79 Status DynamicDeviceMgr::LookupDevice(StringPiece name, Device** device) const {
80 tf_shared_lock l(devices_mu_);
81 auto iter = device_map_.find(string(name));
82 if (iter == device_map_.end()) {
83 std::vector<StringPiece> device_names;
84 for (auto&& itr : device_map_) {
85 device_names.push_back(itr.first);
86 }
87 VLOG(1) << "Unknown device: " << name
88 << " all devices: " << absl::StrJoin(device_names, ", ");
89 return errors::InvalidArgument(name, " unknown device.");
90 }
91 *device = iter->second;
92 return Status::OK();
93 }
94
ContainsDevice(int64 device_incarnation) const95 bool DynamicDeviceMgr::ContainsDevice(int64 device_incarnation) const {
96 tf_shared_lock l(devices_mu_);
97 return device_incarnation_set_.contains(device_incarnation);
98 }
99
ClearContainers(gtl::ArraySlice<string> containers) const100 void DynamicDeviceMgr::ClearContainers(
101 gtl::ArraySlice<string> containers) const {
102 Status s;
103 tf_shared_lock l(devices_mu_);
104 for (const auto& pair : dynamic_devices_) {
105 if (containers.empty()) {
106 s.Update(pair.first->resource_manager()->Cleanup(
107 pair.first->resource_manager()->default_container()));
108 } else {
109 for (const string& c : containers) {
110 s.Update(pair.first->resource_manager()->Cleanup(c));
111 }
112 }
113 if (!s.ok()) {
114 LOG(WARNING) << s;
115 }
116 }
117 }
118
NumDeviceType(const string & type) const119 int DynamicDeviceMgr::NumDeviceType(const string& type) const {
120 tf_shared_lock l(devices_mu_);
121 auto iter = device_type_counts_.find(type);
122 if (iter != device_type_counts_.end()) return iter->second;
123 return 0;
124 }
125
AddDevices(std::vector<std::unique_ptr<Device>> devices)126 Status DynamicDeviceMgr::AddDevices(
127 std::vector<std::unique_ptr<Device>> devices) {
128 mutex_lock l(devices_mu_);
129 for (auto& d : devices) {
130 if (device_map_.find(d->name()) != device_map_.end()) {
131 return errors::InvalidArgument(
132 "Trying to add device ", d->name(),
133 " to manager but its name conflicts with an existing device.");
134 }
135 // Register under the (1) full name and (2) canonical name.
136 for (const string& name :
137 DeviceNameUtils::GetNamesForDeviceMappings(d->parsed_name())) {
138 device_map_[name] = d.get();
139 }
140 // Register under the (3) local name and (4) legacy local name.
141 for (const string& name :
142 DeviceNameUtils::GetLocalNamesForDeviceMappings(d->parsed_name())) {
143 device_map_[name] = d.get();
144 }
145 device_type_counts_[d->device_type()]++;
146 device_incarnation_set_.insert(d->attributes().incarnation());
147 dynamic_devices_.emplace(d.get(), std::move(d));
148 }
149 return Status::OK();
150 }
151
RemoveDevices(std::vector<Device * > devices)152 Status DynamicDeviceMgr::RemoveDevices(std::vector<Device*> devices) {
153 mutex_lock l(devices_mu_);
154
155 for (const auto& d : devices) {
156 if (d == cpu_device_) {
157 TF_RETURN_IF_ERROR(
158 errors::InvalidArgument("Can not remove HostCPU device ", d->name()));
159 }
160 auto it = dynamic_devices_.find(d);
161 if (it == dynamic_devices_.end()) {
162 TF_RETURN_IF_ERROR(errors::InvalidArgument("Unknown device ", d->name()));
163 }
164 }
165
166 for (const auto& d : devices) {
167 auto it = dynamic_devices_.find(d);
168
169 // Clear registration of (1) full name and (2) canonical name
170 for (const string& name :
171 DeviceNameUtils::GetNamesForDeviceMappings(d->parsed_name())) {
172 device_map_.erase(name);
173 }
174 // Clear registration of (3) local name and (4) legacy local name
175 for (const string& name :
176 DeviceNameUtils::GetLocalNamesForDeviceMappings(d->parsed_name())) {
177 device_map_.erase(name);
178 }
179 device_type_counts_[d->device_type()]--;
180 device_incarnation_set_.erase(d->attributes().incarnation());
181 stale_devices_.add(std::move(it->second));
182 dynamic_devices_.erase(it);
183 }
184 return Status::OK();
185 }
186
RemoveDevicesByName(const std::vector<string> & device_names)187 Status DynamicDeviceMgr::RemoveDevicesByName(
188 const std::vector<string>& device_names) {
189 std::vector<Device*> devices_to_remove;
190 for (const string& name : device_names) {
191 Device* device;
192 TF_RETURN_IF_ERROR(LookupDevice(name, &device));
193 devices_to_remove.emplace_back(device);
194 }
195 return RemoveDevices(devices_to_remove);
196 }
197
HostCPU() const198 Device* DynamicDeviceMgr::HostCPU() const {
199 mutex_lock l(devices_mu_);
200 if (dynamic_devices_.find(cpu_device_) != dynamic_devices_.end()) {
201 return cpu_device_;
202 }
203 cpu_device_ = nullptr;
204 for (const auto& pair : dynamic_devices_) {
205 if (pair.first->device_type() == DEVICE_CPU &&
206 pair.first->parsed_name().id == 0) {
207 cpu_device_ = pair.first;
208 break;
209 }
210 }
211 return cpu_device_;
212 }
213
214 } // namespace tensorflow
215