syntax = "proto3";

package tensorflow.eager;

import "tensorflow/core/framework/attr_value.proto";
import "tensorflow/core/framework/device_attributes.proto";
import "tensorflow/core/framework/function.proto";
import "tensorflow/core/framework/tensor.proto";
import "tensorflow/core/framework/tensor_shape.proto";
import "tensorflow/core/framework/versions.proto";
import "tensorflow/core/protobuf/remote_tensor_handle.proto";
import "tensorflow/core/protobuf/tensorflow_server.proto";

option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";

// A proto representation of an eager operation.
message Operation {
  // A unique identifier for the operation. Set by the client so that the client
  // can uniquely identify the outputs of the scheduled operation.
  //
  // In the initial implementation, sending duplicate IDs has undefined
  // behaviour, but additional constraints may be placed upon this in the
  // future.
  int64 id = 1;

  // Name of the operation.
  // NOTE(review): presumably the op type (or function name when is_function
  // is set) -- confirm against the service implementation.
  string name = 2;

  // Inputs of the operation, each referenced as a remote tensor handle.
  repeated RemoteTensorHandle inputs = 3;

  // Control Operation IDs that will be respected when ops are re-ordered by
  // async execution. If async execution (+ op re-ordering) is not enabled, this
  // should have no effect.
  repeated int64 control_op_ids = 4;

  // Attributes of the operation, keyed by string.
  // NOTE(review): keys are presumably attribute names -- confirm.
  map<string, AttrValue> attrs = 5;

  // Device associated with the operation.
  // NOTE(review): presumably the device the op should run on -- confirm.
  string device = 6;

  // Indicates whether the op is a component of a multi-device function.
  bool is_component_function = 7;

  // Set when is_component_function is true. It's initially generated
  // when we create a FunctionLibraryRuntime::Options (negative value) and used
  // to create Rendezvous for function execution. All components of a
  // multi-device function should use the same step id to make sure that they
  // can communicate through Send/Recv ops.
  int64 func_step_id = 8;

  // Indicates whether the op is a function.
  bool is_function = 9;
}

// A single unit of work carried by an EnqueueRequest. Exactly one member of
// `item` is set per QueueItem.
message QueueItem {
  // The remote executor should be able to handle either executing ops directly,
  // or releasing any unused tensor handles, since the tensor lifetime is
  // maintained by the client.
  oneof item {
    // A tensor handle the client no longer uses and wants released.
    RemoteTensorHandle handle_to_decref = 1;

    // An operation to execute.
    Operation operation = 2;

    // Tensors sent directly to the remote worker.
    SendTensorOp send_tensor = 3;

    // Takes a FunctionDef and makes it enqueueable on the remote worker.
    RegisterFunctionOp register_function = 4;

    // Cleans up the step state of a multi-device function.
    CleanupFunctionOp cleanup_function = 5;
  }
}

// The response for a single QueueItem.
message QueueResponse {
  // Tensor shapes reported back for the queue item.
  // NOTE(review): presumably one entry per output of the executed op --
  // confirm against the service implementation.
  repeated TensorShapeProto shape = 1;
}

// Request for EagerService.CreateContext.
message CreateContextRequest {
  // Identifies the full cluster, and this particular worker's position within.
  ServerDef server_def = 1;

  // Whether the ops on the worker should be executed synchronously or
  // asynchronously. By default, ops are executed synchronously.
  bool async = 2;

  // Number of seconds to keep the context alive. If more than keep_alive_secs
  // has passed since a particular context has been communicated with, it will
  // be garbage collected.
  int64 keep_alive_secs = 3;

  // This is the version for all the ops that will be enqueued by the client.
  VersionDef version_def = 4;

  // Device attributes in the cluster.
  repeated DeviceAttributes cluster_device_attributes = 6;

  // The ID of the created context. This is usually a randomly generated number,
  // that will be used to identify the context in future requests to the
  // service. Contexts are not persisted through server restarts.
  // This ID will be used for all future communications as well. It is essential
  // that both ends use this ID for selecting a rendezvous to get everything to
  // match.
  fixed64 context_id = 7;

  // The view ID of the context.
  fixed64 context_view_id = 8;

  // For a multi device function, if false, eagerly copy all remote inputs to
  // the default function device; if true, lazily copy remote inputs to their
  // target devices after function instantiation to avoid redundant copies.
  bool lazy_copy_remote_function_inputs = 9;

  // Field number 5 is retired and must not be reused.
  reserved 5;
}

// Response for EagerService.CreateContext.
message CreateContextResponse {
  // List of devices that are locally accessible to the worker.
  repeated DeviceAttributes device_attributes = 2;

  // Field number 1 is retired and must not be reused.
  reserved 1;
}

// Request for EagerService.UpdateContext.
message UpdateContextRequest {
  // Identifies the full cluster, and this particular worker's position within.
  ServerDef server_def = 1;

  // Device attributes in the cluster.
  // If this field is empty, it indicates that this is a simple update request
  // that only increments the cluster view ID and does not require changes to
  // the workers it connects to.
  repeated DeviceAttributes cluster_device_attributes = 2;

  // The ID of the context to be updated. A context with the specified ID must
  // already exist on the recipient server of this request.
  fixed64 context_id = 3;

  // The view ID of the context, which should be contiguously incremented when
  // updating the same context.
  fixed64 context_view_id = 4;
}

// Response for EagerService.UpdateContext.
message UpdateContextResponse {
  // List of devices that are locally accessible to the worker.
  repeated DeviceAttributes device_attributes = 1;
}

// Request for EagerService.Enqueue and EagerService.StreamingEnqueue.
message EnqueueRequest {
  // The ID identifying the context that the queue items apply to.
  fixed64 context_id = 1;

  // NOTE(review): field number 2 is skipped here without a `reserved` entry,
  // unlike the retired numbers in CreateContextRequest/CreateContextResponse.
  // Confirm whether 2 was retired and should be reserved.

  // The items to enqueue.
  repeated QueueItem queue = 3;
}

// Response for EagerService.Enqueue and EagerService.StreamingEnqueue.
message EnqueueResponse {
  // A single operation response for every item in the request.
  repeated QueueResponse queue_response = 1;
}

// Request for EagerService.WaitQueueDone.
message WaitQueueDoneRequest {
  // The ID identifying the context whose queue is waited on.
  fixed64 context_id = 1;

  // Ids to wait on. If empty, wait on everything currently pending.
  repeated int64 op_id = 2;
}

// Response for EagerService.WaitQueueDone. Currently carries no fields.
message WaitQueueDoneResponse {
  // TODO(nareshmodi): Consider adding NodeExecStats here to be able to
  // propagate some stats.
}

// Request for EagerService.KeepAlive.
message KeepAliveRequest {
  // The ID of the context to keep alive / check for existence.
  fixed64 context_id = 1;
}

// Response for EagerService.KeepAlive.
message KeepAliveResponse {
  // If the requested context_id is on the remote host, set the context view ID.
  fixed64 context_view_id = 1;
}

// Request for EagerService.CloseContext.
message CloseContextRequest {
  // The ID of the context to close.
  fixed64 context_id = 1;

  // The view ID of the context.
  // NOTE(review): presumably used to avoid closing a context that has since
  // been updated to a newer view -- confirm against the implementation.
  fixed64 context_view_id = 2;
}

// Response for EagerService.CloseContext. Carries no fields.
message CloseContextResponse {}

// Registers a function on the remote worker so it can later be enqueued.
message RegisterFunctionOp {
  // The function to register.
  FunctionDef function_def = 1;

  // If true, it means that function_def is produced by graph partition during
  // multi-device function instantiation.
  bool is_component_function = 2;

  // All necessary FunctionDefs and GradientDefs to expand `function_def`.
  // When is_component_function is true, `function_def` could be a nested
  // function, since some nodes in its parent's function body could be
  // replaced with a new function by the graph optimization passes. No need to
  // add FunctionDefs here to the function cache in EagerContext since they
  // won't be executed as KernelAndDevices.
  FunctionDefLibrary library = 3;
}

// Cleanup the step state of a multi-device function (e.g. tensors buffered by
// a `Send` op but not picked up by its corresponding `Recv` op).
message CleanupFunctionOp {
  // The step id whose state should be cleaned up.
  int64 step_id = 1;
}

// Sends tensors directly to the remote worker.
message SendTensorOp {
  // All remote tensors are identified by <Op ID, Output num>. To mimic this
  // situation when directly sending tensors, we include an "artificial" op ID
  // (which would have corresponded to the _Recv op when not using SendTensor).
  int64 op_id = 1;

  // The index within the repeated field is the output number that will help
  // uniquely identify (along with the above op_id) the particular tensor.
  repeated TensorProto tensors = 2;

  // The device on which the tensors should be resident.
  string device_name = 3;
}

////////////////////////////////////////////////////////////////////////////////
//
// Eager Service defines a TensorFlow service that executes operations eagerly
// on a set of local devices, on behalf of a remote Eager executor.
//
// The service impl will keep track of the various clients and devices it has
// access to and allows the client to enqueue ops on any devices that it is able
// to access and schedule data transfers from/to any of the peers.
//
// A client can generate multiple contexts to be able to independently execute
// operations, but cannot share data between the two contexts.
//
// NOTE: Even though contexts generated by clients should be independent, the
// lower level tensorflow execution engine is not, so they might share some data
// (e.g. a Device's ResourceMgr).
//
////////////////////////////////////////////////////////////////////////////////
service EagerService {
  // This initializes the worker, informing it about the other workers in the
  // cluster and exchanging authentication tokens which will be used in all
  // other RPCs to detect whether the worker has restarted.
  rpc CreateContext(CreateContextRequest) returns (CreateContextResponse);

  // This updates the eager context on an existing worker when updating the set
  // of servers in a distributed eager cluster.
  rpc UpdateContext(UpdateContextRequest) returns (UpdateContextResponse);

  // This takes a list of Execute and DeleteTensorHandle operations and enqueues
  // (in async mode) or executes (in sync mode) them on the remote server.
  // All outputs of ops which were not explicitly deleted with
  // DeleteTensorHandle entries will be assumed to be alive and are usable by
  // future calls to Enqueue.
  rpc Enqueue(EnqueueRequest) returns (EnqueueResponse);

  // A streaming version of Enqueue.
  // Current server implementation sends one response per received request.
  // The benefit for using a streaming version is that subsequent requests
  // can be sent without waiting for a response to the previous request. This
  // synchronization is required in the regular Enqueue call because gRPC does
  // not guarantee to preserve request order.
  rpc StreamingEnqueue(stream EnqueueRequest) returns (stream EnqueueResponse);

  // Takes a set of op IDs and waits until those ops are done. Returns any error
  // in the stream so far.
  rpc WaitQueueDone(WaitQueueDoneRequest) returns (WaitQueueDoneResponse);

  // Contexts are always created with a deadline and no RPCs within a deadline
  // will trigger a context garbage collection. KeepAlive calls can be used to
  // delay this. It can also be used to validate the existence of a context ID
  // on remote eager worker. If the context is on remote worker, return the same
  // ID and the current context view ID. This is useful for checking if the
  // remote worker (potentially with the same task name and hostname / port) is
  // replaced with a new process.
  rpc KeepAlive(KeepAliveRequest) returns (KeepAliveResponse);

  // Closes the context. No calls to other methods using the existing context ID
  // are valid after this.
  rpc CloseContext(CloseContextRequest) returns (CloseContextResponse);
}