/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

syntax = "proto3";

package tensorflow;

import "google/protobuf/any.proto";
import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/device_attributes.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/framework/tensor.proto";
import "tensorflow/core/framework/tensor_shape.proto";
import "tensorflow/core/framework/types.proto";
import "tensorflow/core/protobuf/config.proto";
import "tensorflow/core/protobuf/debug.proto";
import "tensorflow/core/protobuf/error_codes.proto";
import "tensorflow/core/protobuf/named_tensor.proto";
import "tensorflow/core/protobuf/tensorflow_server.proto";

option cc_enable_arenas = true;
option java_outer_classname = "WorkerProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.distruntime";
option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto";

////////////////////////////////////////////////////////////////////////////////
//
// GetStatus method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message GetStatusRequest {}

message GetStatusResponse {
  repeated DeviceAttributes device_attributes = 1;
}
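
// Illustrative only: a worker with one CPU and one GPU might answer
// GetStatus along these lines (values are hypothetical; see
// device_attributes.proto for the full DeviceAttributes schema):
//
//   device_attributes {
//     name: "/job:worker/replica:0/task:0/device:CPU:0"
//     device_type: "CPU"
//     memory_limit: 268435456
//   }
//   device_attributes {
//     name: "/job:worker/replica:0/task:0/device:GPU:0"
//     device_type: "GPU"
//   }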

////////////////////////////////////////////////////////////////////////////////
//
// CreateWorkerSession method request/response messages
//
// For each session, the master creates a corresponding WorkerSession on
// every worker that will participate in the session.
//
////////////////////////////////////////////////////////////////////////////////

message CreateWorkerSessionRequest {
  // Sessions are identified by a given handle.
  string session_handle = 1;

  // Defines the configuration of a TensorFlow worker.
  ServerDef server_def = 2;

  // If true, any resources such as Variables used in the session will not be
  // shared with other sessions.
  bool isolate_session_state = 3;

  // The device attributes of all the devices in the cluster.
  repeated DeviceAttributes cluster_device_attributes = 4;

  // The master task name from which the request is sent.
  string master_task = 5;

  // The incarnation ID of the master task's local CPU device.
  // If the target worker already has a WorkerSession created previously with
  // the same master task name but a different incarnation, it usually indicates
  // that the previous master failed before deleting the WorkerSession on the
  // worker. To prevent memory leaks, the worker should garbage collect the old
  // WorkerSessions.
  int64 master_incarnation = 6;

  reserved 7;  // Deprecated config that is embedded within server_def now.
}

message CreateWorkerSessionResponse {}
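
// Illustrative only: the RPC methods that exchange these messages are
// declared in worker_service.proto. A master typically drives a worker
// through the following sequence:
//
//   CreateWorkerSession  -> establish per-session state on the worker
//   RegisterGraph        -> obtain a graph_handle for each subgraph
//   RunGraph (repeated)  -> execute steps against a graph_handle
//   DeregisterGraph      -> drop a graph_handle that is no longer needed
//   DeleteWorkerSession  -> release all worker-side session state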

////////////////////////////////////////////////////////////////////////////////
//
// DeleteWorkerSession method request/response messages
//
// Deletes all worker-side state associated with the given session handle.
//
////////////////////////////////////////////////////////////////////////////////

message DeleteWorkerSessionRequest {
  // Sessions are identified by a given handle.
  string session_handle = 1;
}

message DeleteWorkerSessionResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// RegisterGraph method request/response messages
//
// For each session, after the master has placed every node on a device,
// it partitions the whole graph into many subgraphs. All the nodes in a
// subgraph are assigned to the same worker, but potentially to many
// devices owned by that worker (e.g. cpu0, plus gpu0, gpu1, ..., gpu7).
// The master registers subgraphs for a worker before running any steps. A
// successful registration returns a graph handle to be used in later
// RunGraph requests.
//
////////////////////////////////////////////////////////////////////////////////

message RegisterGraphRequest {
  // Subgraphs are scoped within one session.
  string session_handle = 1;

  // Set to true if `CreateWorkerSession` was called for `session_handle`.
  bool create_worker_session_called = 6;

  // "graph_def" has the subgraph of nodes for this worker, with each node
  // having its device_name filled in.
  GraphDef graph_def = 2;

  // True iff the graph (before partitioning) contains control flow nodes.
  //
  // As of 01/11/2015, this is no longer set by clients.
  bool has_control_flow = 3 [deprecated = true];

  // Configuration options for the session in which this graph was created.
  GraphOptions graph_options = 4;

  // Field(s) used by TensorFlow Debugger (tfdbg).
  DebugOptions debug_options = 5;

  // If graph_def contains any collective ops this must be a positive
  // integer used to coordinate execution with other graphs.  All
  // graphs in a distributed execution with the same
  // collective_graph_key will coordinate to use the same step_id
  // concurrently so that BufRendezvous entries will make the correct
  // values accessible.
  int64 collective_graph_key = 7;

  // ConfigProto from the session in which this graph was created.
  // Contains additional parameters beyond graph_options, including
  // the name of the requested executor.
  ConfigProto config_proto = 8;
}

message RegisterGraphResponse {
  // If the registration succeeds, returns an opaque graph_handle to
  // the master. The master calls RunGraph with graph_handle to
  // compute different steps.
  string graph_handle = 1;
}

////////////////////////////////////////////////////////////////////////////////
//
// DeregisterGraph method request/response messages
//
// The master deregisters the given graph_handle when the graph is no
// longer needed (e.g., the overall graph is re-scheduled and nodes
// are re-placed).
//
// The worker also deregisters a graph_handle automatically, according to
// a TTL-based policy, in case the master restarts.
//
////////////////////////////////////////////////////////////////////////////////

message DeregisterGraphRequest {
  // The session_handle used when registering the graph. If session_handle is
  // empty, a single global namespace is used.
  string session_handle = 2;

  // Set to true if `CreateWorkerSession` was called for `session_handle`.
  bool create_worker_session_called = 3;

  // REQUIRED: graph_handle must be returned by a RegisterGraph call
  // to the same WorkerService.
  string graph_handle = 1;
}

message DeregisterGraphResponse {
  // TODO(mrry): Optionally add summary stats for the graph.
}

////////////////////////////////////////////////////////////////////////////////
//
// CleanupAll method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message CleanupAllRequest {
  // A list of container names.
  //
  // If 'container' is not empty, releases resources in the given
  // containers in all devices.
  //
  // If 'container' is empty, releases resources in the default
  // container in all devices.
  repeated string container = 1;
}

message CleanupAllResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// RunGraph method request/response messages
//
// The worker executes all subgraphs registered under graph_handle.
// RunGraph returns after the execution finishes or an error is
// encountered.
// A sequence of RunGraphRequests with is_partial set may be sent to RunGraph
// for partial graph execution.
//
////////////////////////////////////////////////////////////////////////////////

// Options specific to the execution of a single step.
message ExecutorOpts {
  bool record_costs = 1;
  bool record_timeline = 3;
  bool record_partition_graphs = 4;
  bool report_tensor_allocations_upon_oom = 5;
}

message RunGraphRequest {
  // session_handle is the master-generated unique id for this session.
  // If session_handle is non-empty, it must be the same as used when
  // registering the graph. If it is empty, a single global namespace is used to
  // search for the graph_handle.
  string session_handle = 8;

  // Set to true if `CreateWorkerSession` was called for `session_handle`.
  bool create_worker_session_called = 10;

  // REQUIRED: graph_handle must be returned by a RegisterGraph call
  // to the same WorkerService.
  string graph_handle = 1;

  // A unique ID to distinguish different runs of the same graph.
  //
  // The master generates a globally unique `step_id` to distinguish
  // different runs of the graph computation. Subgraphs communicate
  // (e.g., send/recv ops) with each other using `step_id` to
  // distinguish tensors generated by different runs.
  int64 step_id = 2;

  // Options for this step.
  ExecutorOpts exec_opts = 5;

  // Runs the graph.
  //
  // Sends the tensors in `send` into the graph before the run and
  // fetches the keys into `RunGraphResponse.recv` after the run.
  repeated NamedTensorProto send = 3;
  repeated string recv_key = 4;

  // True if the RunGraphRequest is a partial run request.
  bool is_partial = 6;
  // True if this is the last partial run request in a sequence of requests.
  bool is_last_partial_run = 7;

  // If true then some errors, e.g., execution errors that have long
  // error messages, may return an OK RunGraphResponse with the actual
  // error saved in the status_code/status_error_message fields of the
  // response body. This is a workaround since the RPC subsystem may
  // truncate long metadata messages.
  bool store_errors_in_response_body = 9;

  // Unique identifier for this request. Every RunGraphRequest must have a
  // unique request_id, and retried RunGraphRequests must have the same
  // request_id. If request_id is zero, retry detection is disabled.
  //
  // Retried RunGraphRequests are problematic because they may issue a
  // RecvTensor that will have no corresponding sender and will wait forever.
  // Workers use request_ids to reject retried RunGraph requests instead of
  // waiting forever.
  int64 request_id = 11;

  // Next: 12
}
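
// Illustrative only: a partial run is expressed as a sequence of
// RunGraphRequests that share a step_id, e.g. (handles and ids are
// hypothetical):
//
//   { graph_handle: "g1"  step_id: 42  is_partial: true  send { ... } }
//   { graph_handle: "g1"  step_id: 42  is_partial: true  recv_key: "y" }
//   { graph_handle: "g1"  step_id: 42  is_partial: true
//     is_last_partial_run: true }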

message RunGraphResponse {
  // A list of tensors corresponding to those requested by
  // `RunGraphRequest.recv_key`.
  repeated NamedTensorProto recv = 1;

  // If the request asked for execution stats, the cost graph, or the partition
  // graphs, these are returned here.
  // TODO(suharshs): Package these in a RunMetadata instead.
  StepStats step_stats = 2;
  CostGraphDef cost_graph = 3;
  repeated GraphDef partition_graph = 4;

  // If store_errors_in_response_body is true in the request, then
  // optionally the server may return an OK status for the RPC and
  // fill the true status into the fields below, to allow for messages
  // that are too long to fit in metadata.
  error.Code status_code = 5;
  string status_error_message = 6;
}

////////////////////////////////////////////////////////////////////////////////
//
// CleanupGraph method request/response messages
//
// After the master receives RunGraph responses from all workers, the
// master instructs every worker to clean up any remaining state of a
// step (e.g. tensors buffered by a `Send` op but not picked up by
// other workers). The master does not necessarily need to wait for
// completion of CleanupGraph calls.
//
// Workers should clean up step states automatically according to a
// TTL-based policy in case the master restarts.
//
////////////////////////////////////////////////////////////////////////////////

message CleanupGraphRequest {
  int64 step_id = 1;
}

message CleanupGraphResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// RecvTensor method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message RecvTensorRequest {
  // The step in which the tensor will be produced.
  //
  // REQUIRED: This must eventually correspond to the `step_id` passed
  // into a RunGraph call on the same WorkerService.
  int64 step_id = 1;

  // A key identifying the channel to receive tensors from. A RecvTensor
  // request retrieves one tensor from the channel, but multiple tensors can be
  // sent and received over the same channel with multiple RecvTensor requests.
  // See rendezvous.h for details.
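  //
  // Illustrative only: in the core runtime, Rendezvous::CreateKey produces
  // keys of the form
  //   src_device;src_incarnation;dst_device;tensor_name;frame_id:iter_id
  // so a key might look like (hypothetical values, wrapped here for width;
  // real keys are a single string):
  //   /job:worker/replica:0/task:0/device:GPU:0;0000000000000001;
  //       /job:worker/replica:0/task:1/device:CPU:0;edge_5_y;0:0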
  string rendezvous_key = 2;

  // If true, use an out-of-band DMA mechanism to transfer the
  // received tensor.
  bool dma_ok = 3;

  // Optional information on client-side device locality.
  DeviceLocality client_locality = 4;

  // Optional information on server-side device locality.
  DeviceLocality server_locality = 5;

  // Optional information needed by the RPC subsystem.
  google.protobuf.Any transport_options = 6;

  // Unique identifier for this request. Every RecvTensorRequest must have a
  // unique request_id, and retried RecvTensorRequests must have the same
  // request_id. If request_id is zero, retry detection and the response cache
  // are disabled.
  //
  // Retried RecvTensorRequests are problematic because a RecvTensor with no
  // corresponding sender will wait forever, and the tensor may have been
  // delivered to a previous retry. Workers use request_ids to reject retried
  // RecvTensor requests instead of waiting forever.
  int64 request_id = 7;
}

message RecvTensorResponse {
  // The tensor as a proto.
  TensorProto tensor = 1;

  // If true, this tensor was the output of a dead node, and the
  // content is invalid.
  bool is_dead = 2;

  // The time at which the tensor was available and started to be returned.
  int64 send_start_micros = 3;

  // Optional additional information about how to receive the tensor,
  // e.g. in the event that `RecvTensorRequest.dma_ok` was true.
  google.protobuf.Any transport_options = 4;

  // Whether the receiver should send a MarkRecvFinishedRequest to the sender
  // to ack the message.
  bool require_ack = 5;
}

// Message for managing the response cache maintained on the sender side.
// Currently only used by the gRPC worker service.
message MarkRecvFinishedRequest {
  int64 request_id = 1;
}

message MarkRecvFinishedResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// Logging method request/response messages
//
// NOTE(mrry): This feature is not supported in the open-source
// version, and these messages are expected to change.
//
////////////////////////////////////////////////////////////////////////////////

// Out-of-band request to begin or end logging, or
// to retrieve logs for particular steps.
message LoggingRequest {
  // If true, RPC logging will be enabled.
  bool enable_rpc_logging = 1;

  // If true, RPC logging will be disabled.
  bool disable_rpc_logging = 4;

  // If true, discard any saved logging data (for all steps).
  bool clear = 2;

  // When set, requests all saved log data pertaining to the step.
  // Any log data retrieved is eliminated from the store and cannot be
  // retrieved again.
  repeated int64 fetch_step_id = 3;
}
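
// Illustrative only: a client might first send
//   { enable_rpc_logging: true }
// run some steps, and later send
//   { fetch_step_id: 42  fetch_step_id: 43 }
// to retrieve (and consume) the log data recorded for those steps.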

message LabeledStepStats {
  int64 step_id = 1;
  StepStats step_stats = 2;
}

message LoggingResponse {
  repeated LabeledStepStats step = 1;
}

////////////////////////////////////////////////////////////////////////////////
//
// Tracing method request/response messages
//
// NOTE(mrry): This feature is not supported in the open-source
// version, and these messages are expected to change.
//
////////////////////////////////////////////////////////////////////////////////

message TraceOpts {
  // Length of the trace to be taken, in seconds.
  double duration = 1;
  // If true, capture step profile locally in each worker. Currently
  // unimplemented.
  bool use_step_profiler = 2;
  // If true, capture kernel events from each worker.
  bool use_kernel_profiler = 3;
  // If true, capture extended profiling events from the TensorFlow process.
  bool use_extended_profiler = 4;
  // If true, capture GPU profiling events locally on each
  // machine. Currently unimplemented.
  bool use_gpu_profiler = 5;
  // If true, collect sampled profile events. Currently unimplemented.
  bool use_sample_profiler = 6;
}

// Out-of-band request to configure distributed tracing.
message TracingRequest {
  TraceOpts options = 1;
}

message TracingResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// Raw data transfers in support of Collective Ops.
// These methods are experimental and subject to change.
//
// The intention is to allow collectives to take advantage of the most
// efficient methods available on a platform, e.g. RDMA, and not be
// constrained to use the RPC system in use by other methods.
//
////////////////////////////////////////////////////////////////////////////////

message RecvBufRequest {
  // Use of the fields below may vary by implementation.  For example,
  // buf_ptr and num_bytes may be set only for local operations and
  // not sent on the wire, or only sent on the wire in one direction.

  // Used at server side to find the correct BufRendezvous.
  int64 step_id = 1;

  // Arbitrary string identifying a BufRendezvous entry.
  string buf_rendezvous_key = 2;

  // Size of value expected, must agree with BufRendezvous entry.
  int64 num_bytes = 3;

  // When RDMA is in use, address of destination field on client.
  fixed64 buf_ptr = 4;

  // Optional information on client-side device locality.
  DeviceLocality client_locality = 5;

  // Optional information on server-side device locality.
  DeviceLocality server_locality = 6;

  // Optional, implementation-specific data.
  google.protobuf.Any transport_options = 7;
  // For annotating timeline and device incarnation check.
  string src_device = 8;
  // Optional, for annotating the timeline.
  string dst_device = 9;

  // Depending on the RPC system in use, it may be necessary to set this
  // id to detect resends of RPCs where the server is not aware that
  // the prior RPC failed.
  int64 request_id = 10;

  // Incarnation number of the source device, used to detect worker failures.
  uint64 src_incarnation = 11;
}

message RecvBufResponse {
  // Use of the fields below may vary by implementation.  Comments give
  // intended use.

  fixed64 buf_ptr = 1;  // Address of source field on server.
  int64 num_bytes = 2;  // Byte length of buf_ptr field, if set.
  bool is_dead = 3;     // True if value is 'dead' like a tensor.
  // Optional, implementation-specific data.
  google.protobuf.Any transport_options = 4;
  // Optional, for timeline.
  int64 send_start_micros = 5;

  // Whether the receiver should send a MarkRecvFinishedRequest to the sender
  // to ack the message.
  bool require_ack = 6;
}

////////////////////////////////////////////////////////////////////////////////
//
// Collective Op dynamic group resolution messages.
//
////////////////////////////////////////////////////////////////////////////////

// Supplies one or more device names as members of the group identified by
// group_key.  The service will respond when all group_size devices become
// known.  All devices in the group must have the same type.
message CompleteGroupRequest {
  int32 group_key = 1;
  int32 group_size = 2;
  string device_type = 3;
  int32 collective_type = 5;
  DeviceAttributes device_attributes = 6;

  reserved 4;
}

// Gives the complete membership of the group identified by group_key.
message CompleteGroupResponse {
  int32 group_key = 1;
  int32 group_size = 2;
  string device_type = 3;
  int32 num_tasks = 4;  // number of distinct tasks hosting the devices
  bytes communicator_key = 7;
  repeated DeviceAttributes device_attributes = 8;

  reserved 5, 6;
}
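
// Illustrative only: with group_key 7 and group_size 2, each of two workers
// sends a CompleteGroupRequest such as (values hypothetical):
//
//   { group_key: 7  group_size: 2  device_type: "GPU"
//     device_attributes { name: "/job:worker/replica:0/task:0/device:GPU:0" } }
//
// The service answers both callers only once the second device checks in,
// returning the full membership in CompleteGroupResponse.device_attributes.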

// Supplies data about one collective op belonging to the instance identified
// by instance_key.  The service will respond when all group_size ops have
// become known.  Most of the data being sent is for correctness checking,
// to ensure that all ops in the instance share common attributes.
message CompleteInstanceRequest {
  string name = 1;
  int32 type = 2;
  DataType data_type = 3;
  TensorShapeProto shape = 4;
  int32 group_key = 5;
  int32 group_size = 6;
  int32 instance_key = 7;
  string device_type = 8;
  repeated int32 subdiv_offset = 9;
  string device = 10;
  bool is_source = 11;
}

// Confirms that every op in the instance has consistently declared itself.
// Also gives the source_rank in case of broadcast.
message CompleteInstanceResponse {
  int32 instance_key = 1;
  int32 source_rank = 2;
  reserved 3;
}

// Request for next agreed-upon step_id for the specified graph_keys.
// This is used to enable multiple graphs containing nodes from
// a common collective instance to coordinate using the same step_ids.
message GetStepSequenceRequest {
  repeated int64 graph_key = 1;
}

message StepSequence {
  int64 graph_key = 1;
  int64 next_step_id = 2;
}

// Next valid step_ids for one or more graph_keys.
message GetStepSequenceResponse {
  repeated StepSequence step_sequence = 1;
}
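
// Illustrative only: a GetStepSequenceRequest for graph_key 11 might yield
//   step_sequence { graph_key: 11  next_step_id: 129 }
// so that every graph sharing that key uses the same step_id for its next
// collective step.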