syntax = "proto3";

package tensorflow.eager;

import "tensorflow/core/framework/attr_value.proto";
import "tensorflow/core/framework/device_attributes.proto";
import "tensorflow/core/framework/function.proto";
import "tensorflow/core/framework/tensor.proto";
import "tensorflow/core/framework/tensor_shape.proto";
import "tensorflow/core/framework/versions.proto";
import "tensorflow/core/protobuf/remote_tensor_handle.proto";
import "tensorflow/core/protobuf/tensorflow_server.proto";

option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";

// A proto representation of an eager operation.
message Operation {
  // A unique identifier for the operation. Set by the client so that the client
  // can uniquely identify the outputs of the scheduled operation.
  //
  // In the initial implementation, sending duplicate IDs has undefined
  // behaviour, but additional constraints may be placed upon this in the
  // future.
  int64 id = 1;

  // Name of the operation.
  // NOTE(review): presumably the op type, or the function name when
  // is_function is set -- confirm against the sender.
  string name = 2;

  // Remote tensor handles passed to the operation as inputs.
  repeated RemoteTensorHandle inputs = 3;

  // Control Operation IDs that will be respected when ops are re-ordered by
  // async execution. If async execution (+ op re-ordering) is not enabled, this
  // should have no effect.
  repeated int64 control_op_ids = 4;

  // Attribute values of the operation.
  // NOTE(review): presumably keyed by attribute name -- confirm.
  map<string, AttrValue> attrs = 5;

  // Device associated with the operation.
  // NOTE(review): presumably the device the op should be placed on -- confirm.
  string device = 6;

  // Indicates whether the op is a component of a multi-device function.
  bool is_component_function = 7;

  // Set when is_component_function is true. It's initially generated
  // when we create a FunctionLibraryRuntime::Options (negative value) and used
  // to create Rendezvous for function execution. All components of a
  // multi-device function should use the same step id to make sure that they
  // can communicate through Send/Recv ops.
  int64 func_step_id = 8;

  // Indicates whether the op is a function.
  bool is_function = 9;
}

// A single entry of the queue carried by EnqueueRequest.queue.
message QueueItem {
  // The remote executor should be able to handle either executing ops directly,
  // or releasing any unused tensor handles, since the tensor lifetime is
  // maintained by the client.
  //
  // Exactly one of the members below is set per item.
  oneof item {
    // A tensor handle to release on the remote side.
    RemoteTensorHandle handle_to_decref = 1;
    // An operation to execute.
    Operation operation = 2;
    // Tensors sent directly to the remote worker (see SendTensorOp).
    SendTensorOp send_tensor = 3;
    // Takes a FunctionDef and makes it enqueueable on the remote worker.
    RegisterFunctionOp register_function = 4;
    // Cleans up the step state of a multi-device function (see
    // CleanupFunctionOp).
    CleanupFunctionOp cleanup_function = 5;
  }
}

// The response to a single QueueItem; EnqueueResponse carries one of these for
// every item in the request.
message QueueResponse {
  // Tensor shapes reported back for the item.
  // NOTE(review): presumably one shape per output of the executed operation,
  // in output order -- confirm against the service implementation.
  repeated TensorShapeProto shape = 1;
}

// Request message for EagerService.CreateContext.
message CreateContextRequest {
  // Identifies the full cluster, and this particular worker's position within.
  ServerDef server_def = 1;

  // Whether the ops on the worker should be executed synchronously or
  // asynchronously. By default, ops are executed synchronously.
  bool async = 2;

  // Number of seconds to keep the context alive. If more than keep_alive_secs
  // has passed since a particular context has been communicated with, it will
  // be garbage collected.
  int64 keep_alive_secs = 3;

  // This is the version for all the ops that will be enqueued by the client.
  VersionDef version_def = 4;

  // Device attributes in the cluster.
  repeated DeviceAttributes cluster_device_attributes = 6;

  // The ID of the created context. This is usually a randomly generated number,
  // that will be used to identify the context in future requests to the
  // service. Contexts are not persisted through server restarts.
  // This ID will be used for all future communications as well. It is essential
  // that both ends use this ID for selecting a rendezvous to get everything to
  // match.
  fixed64 context_id = 7;

  // The view ID of the context.
  fixed64 context_view_id = 8;

  // For a multi device function, if false, eagerly copy all remote inputs to
  // the default function device; if true, lazily copy remote inputs to their
  // target devices after function instantiation to avoid redundant copies.
  bool lazy_copy_remote_function_inputs = 9;

  // Field number 5 is reserved and must not be assigned to a new field.
  reserved 5;
}

// Response message for EagerService.CreateContext.
message CreateContextResponse {
  // List of devices that are locally accessible to the worker.
  repeated DeviceAttributes device_attributes = 2;

  // Field number 1 is reserved and must not be assigned to a new field.
  reserved 1;
}

// Request message for EagerService.UpdateContext.
message UpdateContextRequest {
  // Identifies the full cluster, and this particular worker's position within.
  ServerDef server_def = 1;

  // Device attributes in the cluster.
  // If this field is empty, it indicates that this is a simple update request
  // that only increments the cluster view ID and does not require changes to
  // the workers it connects to.
  repeated DeviceAttributes cluster_device_attributes = 2;

  // The ID of the context to be updated. A context with the specified ID must
  // already exist on the recipient server of this request.
  fixed64 context_id = 3;

  // The view ID of the context, which should be contiguously incremented when
  // updating the same context.
  fixed64 context_view_id = 4;
}

// Response message for EagerService.UpdateContext.
message UpdateContextResponse {
  // List of devices that are locally accessible to the worker.
  repeated DeviceAttributes device_attributes = 1;
}

// Request message for EagerService.Enqueue and EagerService.StreamingEnqueue.
message EnqueueRequest {
  // ID of the context the items are enqueued on.
  // NOTE(review): presumably the value sent as CreateContextRequest.context_id
  // -- confirm.
  fixed64 context_id = 1;

  // NOTE(review): field number 2 is skipped by this message. It is reserved
  // defensively on the assumption that an earlier revision used it; confirm
  // against the file history.
  reserved 2;

  // The items to enqueue (in async mode) or execute (in sync mode) on the
  // remote server.
  repeated QueueItem queue = 3;
}

// Response message for EagerService.Enqueue and EagerService.StreamingEnqueue.
message EnqueueResponse {
  // A single operation response for every item in the request.
  repeated QueueResponse queue_response = 1;
}

// Request message for EagerService.WaitQueueDone.
message WaitQueueDoneRequest {
  // ID of the context whose queue is waited on.
  fixed64 context_id = 1;

  // Ids to wait on. If empty, wait on everything currently pending.
  repeated int64 op_id = 2;
}

// Response message for EagerService.WaitQueueDone. Currently carries no fields.
message WaitQueueDoneResponse {
  // TODO(nareshmodi): Consider adding NodeExecStats here to be able to
  // propagate some stats.
}

// Request message for EagerService.KeepAlive.
message KeepAliveRequest {
  // ID of the context to keep alive (or whose existence is being checked).
  fixed64 context_id = 1;
}

// Response message for EagerService.KeepAlive.
message KeepAliveResponse {
  // If the requested context_id is on the remote host, set the context view ID.
  fixed64 context_view_id = 1;
}

// Request message for EagerService.CloseContext.
message CloseContextRequest {
  // ID of the context to close.
  fixed64 context_id = 1;

  // View ID of the context to close.
  // NOTE(review): presumably used to ignore close requests that target a stale
  // view of the context -- confirm against the service implementation.
  fixed64 context_view_id = 2;
}

// Response message for EagerService.CloseContext. Currently carries no fields.
message CloseContextResponse {}

// Queue item payload (QueueItem.register_function) that registers a function
// on the remote worker.
message RegisterFunctionOp {
  // The function to register.
  FunctionDef function_def = 1;

  // If true, it means that function_def is produced by graph partition during
  // multi-device function instantiation.
  bool is_component_function = 2;

  // All necessary FunctionDefs and GradientDefs to expand `function_def`.
  // When is_component_function is true, `function_def` could be a nested
  // function, since some nodes in its parent's function body could be
  // replaced with a new function by the graph optimization passes. No need to
  // add FunctionDefs here to the function cache in EagerContext since they
  // won't be executed as KernelAndDevices.
  FunctionDefLibrary library = 3;
}

// Cleanup the step state of a multi-device function (e.g. tensors buffered by
// a `Send` op but not picked up by its corresponding `Recv` op).
message CleanupFunctionOp {
  // ID of the step whose state is cleaned up.
  // NOTE(review): presumably the same value as Operation.func_step_id for the
  // function being cleaned up -- confirm.
  int64 step_id = 1;
}

// Queue item payload (QueueItem.send_tensor) that sends tensors directly to the
// remote worker.
message SendTensorOp {
  // All remote tensors are identified by <Op ID, Output num>. To mimic this
  // situation when directly sending tensors, we include an "artificial" op ID
  // (which would have corresponded to the _Recv op when not using SendTensor).
  int64 op_id = 1;

  // The index within the repeated field is the output number that will help
  // uniquely identify (along with the above op_id) the particular tensor.
  repeated TensorProto tensors = 2;

  // The device on which the tensors should be resident.
  string device_name = 3;
}

////////////////////////////////////////////////////////////////////////////////
//
// Eager Service defines a TensorFlow service that executes operations eagerly
// on a set of local devices, on behalf of a remote Eager executor.
//
// The service impl will keep track of the various clients and devices it has
// access to and allows the client to enqueue ops on any devices that it is able
// to access and schedule data transfers from/to any of the peers.
//
// A client can generate multiple contexts to be able to independently execute
// operations, but cannot share data between the two contexts.
//
// NOTE: Even though contexts generated by clients should be independent, the
// lower level tensorflow execution engine is not, so they might share some data
// (e.g. a Device's ResourceMgr).
//
////////////////////////////////////////////////////////////////////////////////
service EagerService {
  // This initializes the worker, informing it about the other workers in the
  // cluster and exchanging authentication tokens which will be used in all
  // other RPCs to detect whether the worker has restarted.
  //
  // NOTE(review): CreateContextRequest/CreateContextResponse declare no token
  // field; the client-supplied context_id appears to play this role -- confirm
  // and update this comment accordingly.
  rpc CreateContext(CreateContextRequest) returns (CreateContextResponse);

  // This updates the eager context on an existing worker when updating the set
  // of servers in a distributed eager cluster.
  rpc UpdateContext(UpdateContextRequest) returns (UpdateContextResponse);

  // This takes a list of Execute and DeleteTensorHandle operations and enqueues
  // (in async mode) or executes (in sync mode) them on the remote server.
  // All outputs of ops which were not explicitly deleted with
  // DeleteTensorHandle entries will be assumed to be alive and are usable by
  // future calls to Enqueue.
  //
  // NOTE(review): in QueueItem these presumably correspond to the `operation`
  // and `handle_to_decref` members respectively -- confirm.
  rpc Enqueue(EnqueueRequest) returns (EnqueueResponse);

  // A streaming version of Enqueue.
  // Current server implementation sends one response per received request.
  // The benefit for using a streaming version is that subsequent requests
  // can be sent without waiting for a response to the previous request. This
  // synchronization is required in the regular Enqueue call because gRPC does
  // not guarantee to preserve request order.
  rpc StreamingEnqueue(stream EnqueueRequest) returns (stream EnqueueResponse);

  // Takes a set of op IDs and waits until those ops are done. Returns any error
  // in the stream so far.
  rpc WaitQueueDone(WaitQueueDoneRequest) returns (WaitQueueDoneResponse);

  // Contexts are always created with a deadline; if a context is not
  // communicated with before that deadline passes, it is garbage collected.
  // KeepAlive calls can be used to delay this. It can also be used to validate
  // the existence of a context ID on a remote eager worker: if the context is
  // on the remote worker, the response carries the current context view ID.
  // This is useful for checking if the remote worker (potentially with the
  // same task name and hostname / port) is replaced with a new process.
  //
  // NOTE(review): an earlier wording said the call returns "the same ID" as
  // well, but KeepAliveResponse only declares context_view_id.
  rpc KeepAlive(KeepAliveRequest) returns (KeepAliveResponse);

  // Closes the context. No calls to other methods using the existing context ID
  // are valid after this.
  rpc CloseContext(CloseContextRequest) returns (CloseContextResponse);
}