1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/distributed_runtime/session_mgr.h"
17
18 #include <utility>
19
20 #include "tensorflow/core/common_runtime/device_mgr.h"
21 #include "tensorflow/core/common_runtime/renamed_device.h"
22 #include "tensorflow/core/distributed_runtime/graph_mgr.h"
23 #include "tensorflow/core/distributed_runtime/worker_cache_wrapper.h"
24 #include "tensorflow/core/lib/strings/strcat.h"
25 #include "tensorflow/core/protobuf/cluster.pb.h"
26 #include "tensorflow/core/protobuf/tensorflow_server.pb.h"
27 #include "tensorflow/core/util/ptr_util.h"
28
29 namespace tensorflow {
30
SessionMgr(WorkerEnv * worker_env,const string & default_worker_name,std::unique_ptr<WorkerCacheInterface> default_worker_cache,WorkerCacheFactory worker_cache_factory)31 SessionMgr::SessionMgr(
32 WorkerEnv* worker_env, const string& default_worker_name,
33 std::unique_ptr<WorkerCacheInterface> default_worker_cache,
34 WorkerCacheFactory worker_cache_factory)
35 : worker_env_(worker_env),
36 default_worker_cache_(std::move(default_worker_cache)),
37 legacy_session_(WorkerSession::CreateWithBorrowedDeviceMgr(
38 "", default_worker_name,
39 std::unique_ptr<WorkerCacheInterface>(
40 new WorkerCacheWrapper(default_worker_cache_.get())),
41 worker_env->device_mgr,
42 std::unique_ptr<GraphMgr>(
43 new GraphMgr(worker_env, worker_env->device_mgr)))),
44 worker_cache_factory_(std::move(worker_cache_factory)) {}
45
46 /* static */
WorkerNameFromServerDef(const ServerDef & server_def)47 string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
48 return strings::StrCat("/job:", server_def.job_name(), "/replica:0/task:",
49 server_def.task_index());
50 }
51
CreateSession(const string & session,const ServerDef & server_def,bool isolate_session_state)52 Status SessionMgr::CreateSession(const string& session,
53 const ServerDef& server_def,
54 bool isolate_session_state) {
55 mutex_lock l(mu_);
56 if (session.empty()) {
57 return errors::InvalidArgument("Session must be non-empty.");
58 }
59
60 WorkerCacheInterface* worker_cache = nullptr;
61 string worker_name;
62 if (server_def.cluster().job().empty()) {
63 worker_cache = new WorkerCacheWrapper(default_worker_cache_.get());
64 worker_name = legacy_session_->worker_name;
65 } else {
66 TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache));
67 worker_name = WorkerNameFromServerDef(server_def);
68 }
69
70 if (worker_cache != nullptr && default_worker_cache_ != nullptr) {
71 worker_cache->SetLogging(this->is_logging_active_);
72 }
73
74 CHECK(!worker_env_->local_devices.empty())
75 << "The WorkerEnv must have at least one device in `local_devices`.";
76
77 std::shared_ptr<WorkerSession> worker_session;
78
79 if (isolate_session_state) {
80 // Create a private copy of the DeviceMgr for the WorkerSession.
81 std::vector<std::unique_ptr<Device>> renamed_devices;
82 for (Device* d : worker_env_->local_devices) {
83 renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
84 worker_name, d, false, isolate_session_state));
85 }
86
87 auto device_mgr = MakeUnique<DeviceMgr>(std::move(renamed_devices));
88 auto graph_mgr = MakeUnique<GraphMgr>(worker_env_, device_mgr.get());
89 worker_session.reset(
90 new WorkerSession(session, worker_name,
91 std::unique_ptr<WorkerCacheInterface>(worker_cache),
92 std::move(device_mgr), std::move(graph_mgr)));
93 } else {
94 // Borrown the WorkerEnv's DeviceMgr for the WorkerSession, so
95 // that resources using it can use its devices after the
96 // WorkerSession has been deleted.
97 auto graph_mgr = MakeUnique<GraphMgr>(worker_env_, worker_env_->device_mgr);
98 worker_session = WorkerSession::CreateWithBorrowedDeviceMgr(
99 session, worker_name,
100 std::unique_ptr<WorkerCacheInterface>(worker_cache),
101 worker_env_->device_mgr, std::move(graph_mgr));
102 }
103
104 sessions_.insert(std::make_pair(session, std::move(worker_session)));
105 return Status::OK();
106 }
107
DeleteSession(const string & session)108 Status SessionMgr::DeleteSession(const string& session) {
109 mutex_lock l(mu_);
110 auto it = sessions_.find(session);
111 if (it != sessions_.end()) {
112 sessions_.erase(it);
113 }
114 return Status::OK();
115 }
116
WorkerSessionForSessionLocked(const string & session_handle,std::shared_ptr<WorkerSession> * out_session)117 Status SessionMgr::WorkerSessionForSessionLocked(
118 const string& session_handle, std::shared_ptr<WorkerSession>* out_session) {
119 if (session_handle.empty()) {
120 *out_session = legacy_session_;
121 } else {
122 auto it = sessions_.find(session_handle);
123 if (it == sessions_.end()) {
124 return errors::Aborted("Session handle is not found: ", session_handle,
125 ". Possibly this worker (\"",
126 legacy_session_->worker_name,
127 "\") just restarted.");
128 } else {
129 *out_session = it->second;
130 }
131 }
132 return Status::OK();
133 }
134
WorkerSessionForSession(const string & session_handle,std::shared_ptr<WorkerSession> * out_session)135 Status SessionMgr::WorkerSessionForSession(
136 const string& session_handle, std::shared_ptr<WorkerSession>* out_session) {
137 mutex_lock l(mu_);
138 return WorkerSessionForSessionLocked(session_handle, out_session);
139 }
140
LegacySession()141 std::shared_ptr<WorkerSession> SessionMgr::LegacySession() {
142 return legacy_session_;
143 }
144
SetLogging(bool active)145 void SessionMgr::SetLogging(bool active) {
146 mutex_lock l(mu_);
147 this->is_logging_active_ = active;
148 // Legacy Session
149 if (legacy_session_) {
150 auto* worker_cache = legacy_session_->worker_cache.get();
151 if (worker_cache) {
152 worker_cache->SetLogging(active);
153 }
154 }
155
156 for (const auto& session_kv : sessions_) {
157 auto session = session_kv.second.get();
158 if (session) {
159 auto* worker_cache = session->worker_cache.get();
160 if (worker_cache) {
161 worker_cache->SetLogging(active);
162 }
163 }
164 }
165 }
166
RetrieveLogs(tensorflow::int64 step_id,LoggingResponse * response)167 void SessionMgr::RetrieveLogs(tensorflow::int64 step_id,
168 LoggingResponse* response) {
169 mutex_lock l(mu_);
170 // Legacy Session
171 if (legacy_session_) {
172 auto* worker_cache = legacy_session_->worker_cache.get();
173 if (worker_cache) {
174 auto step_stats = StepStats();
175 if (worker_cache->RetrieveLogs(step_id, &step_stats)) {
176 auto* labeled_step_stats = response->add_step();
177 labeled_step_stats->set_step_id(step_id);
178 labeled_step_stats->mutable_step_stats()->Swap(&step_stats);
179 }
180 }
181 }
182 for (const auto& session_kv : sessions_) {
183 auto session = session_kv.second.get();
184 if (session) {
185 auto* worker_cache = session->worker_cache.get();
186 if (worker_cache) {
187 auto step_stats = StepStats();
188 if (worker_cache->RetrieveLogs(step_id, &step_stats)) {
189 auto* labeled_step_stats = response->add_step();
190 labeled_step_stats->set_step_id(step_id);
191 labeled_step_stats->mutable_step_stats()->Swap(&step_stats);
192 }
193 }
194 }
195 }
196 }
197
ClearLogs()198 void SessionMgr::ClearLogs() {
199 mutex_lock l(mu_);
200 // Legacy Session
201 if (legacy_session_) {
202 auto* worker_cache = legacy_session_->worker_cache.get();
203 if (worker_cache) {
204 worker_cache->ClearLogs();
205 }
206 }
207
208 for (const auto& session_kv : sessions_) {
209 auto session = session_kv.second.get();
210 if (session) {
211 auto* worker_cache = session->worker_cache.get();
212 if (worker_cache) {
213 worker_cache->ClearLogs();
214 }
215 }
216 }
217 }
218 } // namespace tensorflow
219