syntax = "proto3";

package tensorflow.eager;

import "tensorflow/core/framework/attr_value.proto";
import "tensorflow/core/framework/device_attributes.proto";
import "tensorflow/core/framework/function.proto";
import "tensorflow/core/framework/tensor.proto";
import "tensorflow/core/framework/tensor_shape.proto";
import "tensorflow/core/framework/versions.proto";
import "tensorflow/core/protobuf/tensorflow_server.proto";

// Identifies a tensor by the operation that produced it and the position of
// the tensor among that operation's outputs.
message RemoteTensorHandle {
  // The ID of the operation that produced this tensor.
  int64 op_id = 1;
  // The index into the outputs of the operation that produced this tensor.
  int32 output_num = 2;
}

// A proto representation of an eager operation.
message Operation {
  // A unique identifier for the operation. Set by the client so that the client
  // can uniquely identify the outputs of the scheduled operation.
  //
  // In the initial implementation, sending duplicate IDs has undefined
  // behaviour, but additional constraints may be placed upon this in the
  // future.
  int64 id = 1;
  // Name of the operation.
  // NOTE(review): presumably the registered op (or function) name -- confirm.
  string name = 2;
  // Handles of the tensors that are passed to the operation as inputs.
  repeated RemoteTensorHandle inputs = 3;

  // Control Operation IDs that will be respected when ops are re-ordered by
  // async execution. If async execution (+ op re-ordering) is not enabled, this
  // should have no effect.
  repeated int64 control_op_ids = 4;
  // Attribute values of the operation.
  // NOTE(review): presumably keyed by attribute name -- confirm.
  map<string, AttrValue> attrs = 5;
  // Device associated with the operation.
  // NOTE(review): presumably the name of the device to execute on -- confirm.
  string device = 6;
}

// One entry of the queue sent in an EnqueueRequest.
message QueueItem {
  // The remote executor should be able to handle either executing ops directly,
  // or releasing any unused tensor handles, since the tensor lifetime is
  // maintained by the client.
  oneof item {
    // A tensor handle that the client no longer needs.
    RemoteTensorHandle handle_to_decref = 1;
    // An operation to execute.
    Operation operation = 2;
  }
}

// The response for a single QueueItem.
message QueueResponse {
  // Tensor shapes reported for the queue item.
  // NOTE(review): presumably one shape per output of the executed operation --
  // confirm.
  repeated TensorShapeProto shape = 1;
}

// Request message for EagerService.CreateContext.
message CreateContextRequest {
  // Identifies the full cluster, and this particular worker's position within.
  ServerDef server_def = 1;

  // Whether the ops on the worker should be executed synchronously or
  // asynchronously. By default, ops are executed synchronously.
  bool async = 2;

  // Number of seconds to keep the context alive. If more than keep_alive_secs
  // has passed since a particular context has been communicated with, it will
  // be garbage collected.
  int64 keep_alive_secs = 3;

  // This is the version for all the ops that will be enqueued by the client.
  VersionDef version_def = 4;

  // This ID will be used for all future communications. It is essential that
  // both ends use this ID for selecting a rendezvous to get everything to
  // match.
  int64 rendezvous_id = 5;
}

// Response message for EagerService.CreateContext.
message CreateContextResponse {
  // The ID of the created context. This is usually a randomly generated number,
  // that will be used to identify the context in future requests to the
  // service. Contexts are not persisted through server restarts.
  fixed64 context_id = 1;

  // List of devices that are locally accessible to the worker.
  repeated DeviceAttributes device_attributes = 2;
}

// Request message for EagerService.Enqueue.
message EnqueueRequest {
  // NOTE(review): field number 2 is skipped in this message. It is reserved
  // here so that it cannot be reused by accident in case an earlier revision
  // of this message used it -- confirm against the file history.
  reserved 2;

  // The ID of the context (see CreateContextResponse.context_id) that the
  // items are enqueued on.
  fixed64 context_id = 1;

  // The items to enqueue.
  repeated QueueItem queue = 3;
}

// Response message for EagerService.Enqueue.
message EnqueueResponse {
  // A single operation response for every item in the request.
  repeated QueueResponse queue_response = 1;
}

// Request message for EagerService.WaitQueueDone.
message WaitQueueDoneRequest {
  // The ID of the context (see CreateContextResponse.context_id) to wait on.
  fixed64 context_id = 1;

  // Ids to wait on. If empty, wait on everything currently pending.
  repeated int64 op_id = 2;
}

// Response message for EagerService.WaitQueueDone. Currently empty.
message WaitQueueDoneResponse {
  // TODO(nareshmodi): Consider adding NodeExecStats here to be able to
  // propagate some stats.
}

// Request message for EagerService.KeepAlive.
message KeepAliveRequest {
  // The ID of the context (see CreateContextResponse.context_id) to keep alive.
  fixed64 context_id = 1;
}

// Response message for EagerService.KeepAlive. Currently empty.
message KeepAliveResponse {
}

// Request message for EagerService.CloseContext.
message CloseContextRequest {
  // The ID of the context (see CreateContextResponse.context_id) to close.
  fixed64 context_id = 1;
}

// Response message for EagerService.CloseContext. Currently empty.
message CloseContextResponse {
}

// Request message for EagerService.RegisterFunction.
message RegisterFunctionRequest {
  // The ID of the context (see CreateContextResponse.context_id) that the
  // function is registered with.
  fixed64 context_id = 1;

  // The definition of the function to register.
  FunctionDef function_def = 2;
}

// Response message for EagerService.RegisterFunction. Currently empty.
message RegisterFunctionResponse {
}

// Request message for EagerService.SendTensor.
message SendTensorRequest {
  // The ID of the context (see CreateContextResponse.context_id) that receives
  // the tensors.
  fixed64 context_id = 1;

  // All remote tensors are identified by <Op ID, Output num>. To mimic this
  // situation when directly sending tensors, we include an "artificial" op ID
  // (which would have corresponded to the _Recv op when not using SendTensor).
  int64 op_id = 2;
  // The index within the repeated field is the output number that will help
  // uniquely identify (along with the above op_id) the particular tensor.
  repeated TensorProto tensors = 3;

  // The device on which the tensors should be resident.
  string device_name = 4;
}

// Response message for EagerService.SendTensor. Currently empty.
message SendTensorResponse {
}

////////////////////////////////////////////////////////////////////////////////
//
// Eager Service defines a TensorFlow service that executes operations eagerly
// on a set of local devices, on behalf of a remote Eager executor.
//
// The service impl will keep track of the various clients and devices it has
// access to and allows the client to enqueue ops on any devices that it is able
// to access and schedule data transfers from/to any of the peers.
//
// A client can generate multiple contexts to be able to independently execute
// operations, but cannot share data between the two contexts.
//
// NOTE: Even though contexts generated by clients should be independent, the
// lower level tensorflow execution engine is not, so they might share some data
// (e.g. a Device's ResourceMgr).
//
////////////////////////////////////////////////////////////////////////////////
service EagerService {
  // This initializes the worker, informing it about the other workers in the
  // cluster and exchanging authentication tokens which will be used in all
  // other RPCs to detect whether the worker has restarted.
  rpc CreateContext(CreateContextRequest) returns (CreateContextResponse);

  // This takes a list of queue items -- operations to execute
  // (QueueItem.operation) and tensor handles to release
  // (QueueItem.handle_to_decref) -- and enqueues (in async mode) or executes
  // (in sync mode) them on the remote server.
  // All outputs of ops which were not explicitly released with a
  // handle_to_decref entry will be assumed to be alive and are usable by
  // future calls to Enqueue.
  rpc Enqueue(EnqueueRequest) returns (EnqueueResponse);

  // Takes a set of op IDs and waits until those ops are done. Returns any error
  // in the stream so far.
  rpc WaitQueueDone(WaitQueueDoneRequest) returns (WaitQueueDoneResponse);

  // Contexts are always created with a deadline; if no RPCs are received
  // within that deadline, the context is garbage collected. KeepAlive calls
  // can be used to delay this.
  rpc KeepAlive(KeepAliveRequest) returns (KeepAliveResponse);

  // Closes the context. No calls to other methods using the existing context ID
  // are valid after this.
  rpc CloseContext(CloseContextRequest) returns (CloseContextResponse);

  // Takes a FunctionDef and makes it enqueueable on the remote worker.
  rpc RegisterFunction(RegisterFunctionRequest)
      returns (RegisterFunctionResponse);

  // An RPC to push tensors to the server. At times, certain environments don't
  // allow the server to connect back to the client.
  rpc SendTensor(SendTensorRequest) returns (SendTensorResponse);
}