/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_MGR_H_
#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_MGR_H_

#include <unordered_map>

#include "tensorflow/core/common_runtime/eager/eager_executor.h"
#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
#include "tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h"
#include "tensorflow/core/platform/mutex.h"

namespace tensorflow {
namespace eager {

// This class manages the states required to set up an eager cluster.
// TODO(fishx): Move remote state from context to this class.
class RemoteMgr {
 public:
  RemoteMgr(bool is_master, EagerContext* ctx)
      : is_master_(is_master), parent_(ctx) {}

  ~RemoteMgr() {
    for (const auto& entry : remote_tensor_handle_map_) {
      entry.second->Unref();
    }
  }

  bool IsMaster() { return is_master_; }

  void AddOperationOutputs(
      const gtl::ArraySlice<tensorflow::TensorHandle*> handles,
      int64 operation_id);

  Status GetTensorHandle(const RemoteTensorHandleInternal& remote_handle,
                         tensorflow::TensorHandle** handle);

  Status DeleteTensorHandle(const RemoteTensorHandleInternal& remote_handle);

  // Helper function to create monotonically increasing ids unique to this
  // context.
  uint64 NextOpId() {
    DCHECK(is_master_);
    mutex_lock l(next_id_mutex_);
    return next_op_id_++;
  }

  // Serialize a remote TensorHandle to a RemoteTensorHandle.
  Status SerializeRemoteTensorHandle(
      TensorHandle* in, RemoteTensorHandle* out, Device* device,
      const string& device_name,
      const bool serialize_resource_dtype_and_shape = false);

  // Deserialize a RemoteTensorHandle to a TensorHandle (local/remote).
  // The output holds a reference to the TensorHandle.
  Status DeserializeRemoteTensorHandle(const RemoteTensorHandle& in,
                                       TensorHandle** out);

  EagerExecutor& GetOrCreateExecutorForStream(uint64 stream_id);

  void DeleteExecutorForStream(uint64 stream_id);

 protected:
  mutex next_id_mutex_;
  uint64 next_op_id_ GUARDED_BY(next_id_mutex_) = 1;

 private:
  // Returns the op_id and output_num if the given local TensorHandle exists in
  // remote_tensor_handle_map_.
  Status GetRemoteTensorHandle(const tensorflow::TensorHandle* handle,
                               int64* op_id, int32* output_num)
      SHARED_LOCKS_REQUIRED(remote_tensor_handle_mu_);

  Status GetTensorHandleImpl(const RemoteTensorHandleInternal& remote_handle,
                             tensorflow::TensorHandle** handle)
      SHARED_LOCKS_REQUIRED(remote_tensor_handle_mu_);

  Status GetMirroredResourceShape(
      const RemoteTensorHandleInternal& remote_handle,
      std::vector<DtypeAndPartialTensorShape>* handle);

  bool is_master_;

  using RemoteTensorHandleMap =
      gtl::FlatMap<RemoteTensorHandleInternal, tensorflow::TensorHandle*,
                   RemoteTensorHandleInternalHash,
                   RemoteTensorHandleInternalEquals>;
  using MirroredResourceShapeMap = gtl::FlatMap<
      RemoteTensorHandleInternal, std::vector<DtypeAndPartialTensorShape>,
      RemoteTensorHandleInternalHash, RemoteTensorHandleInternalEquals>;

  mutex remote_tensor_handle_mu_;
  // This map maintains the TensorHandles that are required by remote workers
  // in the cluster. Each map key is generated by the master, so it should be
  // globally unique. This map owns references on the handles it contains.
  RemoteTensorHandleMap remote_tensor_handle_map_
      GUARDED_BY(remote_tensor_handle_mu_);

  mutex mirrored_resource_shape_mu_;
  // This map maintains the data types and shapes of resource variables
  // required by remote workers in the cluster. Each map key is generated by
  // the master, so it should be globally unique.
  MirroredResourceShapeMap mirrored_resource_shape_map_
      GUARDED_BY(mirrored_resource_shape_mu_);

  EagerContext* parent_;  // not owned.

  mutex executor_map_mu_;
  std::unordered_map<uint64, EagerExecutor> executor_map_
      GUARDED_BY(executor_map_mu_);
};

}  // namespace eager
}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_MGR_H_
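
// Example: a minimal usage sketch of a master-side RemoteMgr, kept in comments
// for illustration only. It assumes the caller already has a valid
// EagerContext* `ctx`, a local tensorflow::TensorHandle* `handle`, and the
// Device* `device` that `handle` lives on; none of these are provided here.
//
//   tensorflow::eager::RemoteMgr remote_mgr(/*is_master=*/true, ctx);
//
//   // Register the handle under a fresh op id so remote workers in the
//   // cluster can refer to it by that id.
//   const tensorflow::uint64 op_id = remote_mgr.NextOpId();
//   remote_mgr.AddOperationOutputs({handle}, op_id);
//
//   // Serialize the handle into a RemoteTensorHandle proto, e.g. to embed it
//   // in an RPC sent to a remote worker.
//   tensorflow::eager::RemoteTensorHandle proto;
//   TF_CHECK_OK(remote_mgr.SerializeRemoteTensorHandle(
//       handle, &proto, device, device->name()));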