1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/distributed_runtime/remote_device.h"
17
18 #include <vector>
19
20 #include "tensorflow/core/common_runtime/device.h"
21 #include "tensorflow/core/common_runtime/process_util.h"
22 #include "tensorflow/core/distributed_runtime/worker_cache.h"
23 #include "tensorflow/core/distributed_runtime/worker_interface.h"
24 #include "tensorflow/core/lib/core/errors.h"
25 #include "tensorflow/core/lib/gtl/cleanup.h"
26 #include "tensorflow/core/platform/logging.h"
27 #include "tensorflow/core/platform/macros.h"
28 #include "tensorflow/core/protobuf/worker.pb.h"
29
30 namespace tensorflow {
31
32 // TODO(zhifengc): We need to consolidate (full/partial) device name
33 // parsing into one place.
34 //
35 // Parses and returns the local device part (e.g., cpu:0, gpu:4).
GetLocalDeviceName(StringPiece fullname)36 string GetLocalDeviceName(StringPiece fullname) {
37 auto pos = fullname.rfind('/');
38 CHECK_NE(pos, StringPiece::npos);
39 fullname.remove_prefix(pos + 1);
40 return string(fullname);
41 }
42
43 class RemoteDevice : public Device {
44 public:
RemoteDevice(Env * env,const DeviceAttributes & da)45 RemoteDevice(Env* env, const DeviceAttributes& da)
46 : Device(env, da), local_dev_name_(GetLocalDeviceName(da.name())) {}
47
Sync()48 Status Sync() override { return Status::OK(); }
GetAllocator(AllocatorAttributes attr)49 Allocator* GetAllocator(AllocatorAttributes attr) override { return nullptr; }
50
51 private:
52 const string local_dev_name_;
53
54 TF_DISALLOW_COPY_AND_ASSIGN(RemoteDevice);
55 };
56
NewRemoteDevices(Env * env,WorkerCacheInterface * worker_cache,const string & worker_name,NewRemoteDevicesDone done)57 void NewRemoteDevices(Env* env, WorkerCacheInterface* worker_cache,
58 const string& worker_name, NewRemoteDevicesDone done) {
59 WorkerInterface* wi = worker_cache->CreateWorker(worker_name);
60 if (wi == nullptr) {
61 std::vector<Device*> empty;
62 done(errors::NotFound("Device ", worker_name, " is not found."), &empty);
63 return;
64 }
65 struct Call {
66 GetStatusRequest req;
67 GetStatusResponse resp;
68 };
69 Call* call = new Call;
70 auto cb = [env, worker_cache, worker_name, done, wi,
71 call](const Status& status) {
72 Status s = status;
73 std::vector<Device*> remote_devices;
74 auto cleanup = gtl::MakeCleanup(
75 [&worker_cache, &worker_name, &wi, &done, &remote_devices, &s, call] {
76 worker_cache->ReleaseWorker(worker_name, wi);
77 done(s, &remote_devices);
78 delete call;
79 });
80 if (s.ok()) {
81 DeviceNameUtils::ParsedName worker_name_parsed;
82 if (!DeviceNameUtils::ParseFullName(worker_name, &worker_name_parsed) ||
83 !worker_name_parsed.has_job || !worker_name_parsed.has_replica ||
84 !worker_name_parsed.has_task) {
85 s = errors::InvalidArgument("Could not parse worker name: ",
86 worker_name);
87 LOG(WARNING) << s;
88 return;
89 }
90 remote_devices.reserve(call->resp.device_attributes_size());
91 for (const DeviceAttributes& da : call->resp.device_attributes()) {
92 DeviceNameUtils::ParsedName device_name_parsed;
93 CHECK(DeviceNameUtils::ParseFullName(da.name(), &device_name_parsed))
94 << "Device attribute name '" << da.name() << "' could not be "
95 << "parsed. Device Attribute: " << da.DebugString();
96 // Preserve the exact name, if possible.
97 // TODO(b/37868888): Simplify when legacy device name formats removed.
98 if (device_name_parsed.job == worker_name_parsed.job &&
99 device_name_parsed.replica == worker_name_parsed.replica &&
100 device_name_parsed.task == worker_name_parsed.task) {
101 auto d = new RemoteDevice(env, da);
102 remote_devices.push_back(d);
103 } else {
104 DeviceAttributes da_rewritten = da;
105 da_rewritten.set_name(DeviceNameUtils::FullName(
106 worker_name_parsed.job, worker_name_parsed.replica,
107 worker_name_parsed.task, device_name_parsed.type,
108 device_name_parsed.id));
109 auto d = new RemoteDevice(env, da_rewritten);
110 remote_devices.push_back(d);
111 }
112 }
113 }
114 };
115 wi->GetStatusAsync(&call->req, &call->resp, cb);
116 }
117
118 } // namespace tensorflow
119