/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

syntax = "proto3";

package tensorflow;

option cc_enable_arenas = true;
option java_outer_classname = "WorkerProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.distruntime";

option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";

import "google/protobuf/any.proto";
import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/device_attributes.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/framework/tensor.proto";
import "tensorflow/core/framework/tensor_shape.proto";
import "tensorflow/core/framework/types.proto";
import "tensorflow/core/protobuf/config.proto";
import "tensorflow/core/protobuf/debug.proto";
import "tensorflow/core/protobuf/error_codes.proto";
import "tensorflow/core/protobuf/named_tensor.proto";
import "tensorflow/core/protobuf/tensorflow_server.proto";

////////////////////////////////////////////////////////////////////////////////
//
// GetStatus method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message GetStatusRequest {}

message GetStatusResponse {
  repeated DeviceAttributes device_attributes = 1;
}

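// Example: a hypothetical GetStatusResponse in protobuf text format. The
// device name and memory limit below are illustrative values, not output
// from a real worker:
//
//   device_attributes {
//     name: "/job:worker/replica:0/task:0/device:CPU:0"
//     device_type: "CPU"
//     memory_limit: 268435456
//   }
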
////////////////////////////////////////////////////////////////////////////////
//
// CreateSession method request/response messages
//
// For each session, the master creates a corresponding worker session on
// every worker that participates in the session.
//
////////////////////////////////////////////////////////////////////////////////

message CreateWorkerSessionRequest {
  // Sessions are identified by a given handle.
  string session_handle = 1;

  // Defines the configuration of a TensorFlow worker.
  ServerDef server_def = 2;

  // If true, any resources such as Variables used in the session will not be
  // shared with other sessions.
  bool isolate_session_state = 3;

  // The device attributes of all the devices in the cluster.
  repeated DeviceAttributes cluster_device_attributes = 4;
}

message CreateWorkerSessionResponse {}

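// Example: a hypothetical CreateWorkerSessionRequest in text format for a
// two-task cluster. The handle and host addresses are illustrative only:
//
//   session_handle: "session_0001"
//   server_def {
//     cluster {
//       job {
//         name: "worker"
//         tasks { key: 0 value: "localhost:2222" }
//         tasks { key: 1 value: "localhost:2223" }
//       }
//     }
//     job_name: "worker"
//     task_index: 0
//     protocol: "grpc"
//   }
//   isolate_session_state: true
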
////////////////////////////////////////////////////////////////////////////////
//
// DeleteSession method request/response messages
//
// Deletes all worker-side state associated with the given session handle.
//
////////////////////////////////////////////////////////////////////////////////

message DeleteWorkerSessionRequest {
  // Sessions are identified by a given handle.
  string session_handle = 1;
}

message DeleteWorkerSessionResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// RegisterGraph method request/response messages
//
// For each session, after the master has placed every node on a device,
// it partitions the whole graph into many subgraphs. All the nodes in
// a subgraph are assigned to the same worker, but potentially to many
// devices owned by that worker (e.g. cpu0, plus gpu0, gpu1, ..., gpu7). The
// master registers subgraphs for a worker before running any steps. A
// successful registration returns a graph handle to be used in later
// RunGraph requests.
//
////////////////////////////////////////////////////////////////////////////////

message RegisterGraphRequest {
  // Subgraphs are scoped within one session.
  string session_handle = 1;

  // Set to true if `CreateWorkerSession` was called for `session_handle`.
  bool create_worker_session_called = 6;

  // "graph_def" has the subgraph of nodes for this worker, with each node
  // having its device_name filled in.
  GraphDef graph_def = 2;

  // True iff the graph (before partitioning) contains control flow nodes.
  //
  // As of 01/11/2015, this is no longer set by clients.
  bool has_control_flow = 3 [deprecated = true];

  // Configuration options for the session in which this graph was created.
  GraphOptions graph_options = 4;

  // Field(s) used by TensorFlow Debugger (tfdbg).
  DebugOptions debug_options = 5;

  // If graph_def contains any collective ops this must be a positive
  // integer used to coordinate execution with other graphs. All
  // graphs in a distributed execution with the same
  // collective_graph_key will coordinate to use the same step_id
  // concurrently so that BufRendezvous entries will make the correct
  // values accessible.
  int64 collective_graph_key = 7;

  // ConfigProto from the session in which this graph was created.
  // Contains additional parameters beyond graph_options, including
  // the name of the requested executor.
  ConfigProto config_proto = 8;
}

message RegisterGraphResponse {
  // If the registration succeeds, returns an opaque graph_handle to
  // the master. The master calls RunGraph with graph_handle to
  // compute different steps.
  string graph_handle = 1;
}

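// Example: a hypothetical RegisterGraphRequest in text format. The subgraph
// here is a single placeholder node; real requests carry a full partition of
// the session's graph:
//
//   session_handle: "session_0001"
//   create_worker_session_called: true
//   graph_def {
//     node {
//       name: "x"
//       op: "Placeholder"
//       device: "/job:worker/replica:0/task:0/device:CPU:0"
//       attr {
//         key: "dtype"
//         value { type: DT_FLOAT }
//       }
//     }
//   }
//
// A successful registration returns, e.g., `graph_handle: "graph_00042"`,
// which subsequent RunGraph and DeregisterGraph calls must echo back.
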
////////////////////////////////////////////////////////////////////////////////
//
// DeregisterGraph method request/response messages
//
// The master deregisters the given graph_handle when the graph is no
// longer needed (e.g., the overall graph is re-scheduled and nodes
// are re-placed).
//
// The worker deregisters a graph_handle automatically according to a
// TTL-based policy, in case the master restarts.
//
////////////////////////////////////////////////////////////////////////////////

message DeregisterGraphRequest {
  // The session_handle used when registering the graph. If session_handle is
  // empty, a single global namespace is used.
  string session_handle = 2;

  // Set to true if `CreateWorkerSession` was called for `session_handle`.
  bool create_worker_session_called = 3;

  // REQUIRED: graph_handle must be returned by a RegisterGraph call
  // to the same WorkerService.
  string graph_handle = 1;
}

message DeregisterGraphResponse {
  // TODO(mrry): Optionally add summary stats for the graph.
}

////////////////////////////////////////////////////////////////////////////////
//
// CleanupAll method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message CleanupAllRequest {
  // A list of container names.
  //
  // If 'container' is not empty, releases resources in the given
  // containers in all devices.
  //
  // If 'container' is empty, releases resources in the default
  // container in all devices.
  repeated string container = 1;
}

message CleanupAllResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// RunGraph request / response messages
//
// The worker executes all subgraphs registered under graph_handle.
// RunGraph returns after the execution finishes or an error is
// encountered.
// A sequence of RunGraphRequests with is_partial set may be sent to RunGraph
// for partial graph execution.
//
////////////////////////////////////////////////////////////////////////////////

// Options specific to the execution of a single step.
message ExecutorOpts {
  bool record_costs = 1;
  bool record_timeline = 3;
  bool record_partition_graphs = 4;
  bool report_tensor_allocations_upon_oom = 5;
}

message RunGraphRequest {
  // session_handle is the master-generated unique id for this session.
  // If session_handle is non-empty, it must be the same as used when
  // registering the graph. If it is empty, a single global namespace is used to
  // search for the graph_handle.
  string session_handle = 8;

  // Set to true if `CreateWorkerSession` was called for `session_handle`.
  bool create_worker_session_called = 10;

  // REQUIRED: graph_handle must be returned by a RegisterGraph call
  // to the same WorkerService.
  string graph_handle = 1;

  // A unique ID to distinguish different runs of the same graph.
  //
  // The master generates a globally unique `step_id` to distinguish
  // different runs of the graph computation. Subgraphs communicate
  // (e.g., send/recv ops) with each other using `step_id` to
  // distinguish tensors generated by different runs.
  int64 step_id = 2;

  // Options for this step.
  ExecutorOpts exec_opts = 5;

  // Runs the graph.
  //
  // Sends the tensors in "send" into the graph before the run and
  // fetches the keys into `RunGraphResponse.recv` after the run.
  repeated NamedTensorProto send = 3;
  repeated string recv_key = 4;

  // True if the RunGraphRequest is a partial run request.
  bool is_partial = 6;
  // True if this is the last partial run request in a sequence of requests.
  bool is_last_partial_run = 7;

  // If true then some errors, e.g., execution errors that have long
  // error messages, may return an OK RunGraphResponse with the actual
  // error saved in the status_code/status_error_message fields of the
  // response body. This is a workaround since the RPC subsystem may
  // truncate long metadata messages.
  bool store_errors_in_response_body = 9;

  // Unique identifier for this request. Every RunGraphRequest must have a
  // unique request_id, and retried RunGraphRequests must have the same
  // request_id. If request_id is zero, retry detection is disabled.
  //
  // Retried RunGraphRequests are problematic because they may issue a
  // RecvTensor that will have no corresponding sender and will wait forever.
  // Workers use request_ids to reject retried RunGraph requests instead of
  // waiting forever.
  int64 request_id = 11;

  // Next: 12
}

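// Example: a hypothetical RunGraphRequest in text format, feeding one scalar
// tensor and fetching one key. Handles, keys, and ids are illustrative only:
//
//   session_handle: "session_0001"
//   create_worker_session_called: true
//   graph_handle: "graph_00042"
//   step_id: 7
//   exec_opts { record_timeline: true }
//   send {
//     name: "x:0"
//     tensor { dtype: DT_FLOAT tensor_shape {} float_val: 3.5 }
//   }
//   recv_key: "y:0"
//
// The matching RunGraphResponse would then contain a `recv` entry named
// "y:0" holding the fetched tensor.
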
message RunGraphResponse {
  // A list of tensors corresponding to those requested by
  // `RunGraphRequest.recv_key`.
  repeated NamedTensorProto recv = 1;

  // If the request asked for execution stats, the cost graph, or the partition
  // graphs, these are returned here.
  // TODO(suharshs): Package these in a RunMetadata instead.
  StepStats step_stats = 2;
  CostGraphDef cost_graph = 3;
  repeated GraphDef partition_graph = 4;

  // If store_errors_in_response_body is true in the request, then
  // optionally the server may return an OK status for the RPC and
  // fill the true status into the fields below, to allow for messages
  // that are too long to fit in metadata.
  error.Code status_code = 5;
  string status_error_message = 6;
}

////////////////////////////////////////////////////////////////////////////////
//
// CleanupGraph method request/response messages
//
// After the master receives RunGraph responses from all workers, the
// master instructs every worker to clean up any remaining state of a
// step (e.g. tensors buffered by a `Send` op but not picked up by
// other workers). The master does not necessarily need to wait for
// completion of CleanupGraph calls.
//
// Workers should clean up step states automatically according to a
// TTL-based policy in case of master restarts.
//
////////////////////////////////////////////////////////////////////////////////

message CleanupGraphRequest {
  int64 step_id = 1;
}

message CleanupGraphResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// RecvTensor method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message RecvTensorRequest {
  // The step in which the tensor will be produced.
  //
  // REQUIRED: This must eventually correspond to the `step_id` passed
  // into a RunGraph call on the same WorkerService.
  int64 step_id = 1;

  // A key identifying the channel to receive tensors from. A RecvTensor
  // request retrieves one tensor from the channel, but multiple tensors can be
  // sent and received over the same channel with multiple RecvTensor requests.
  // See rendezvous.h for details.
  string rendezvous_key = 2;

  // If true, use an out-of-band DMA mechanism to transfer the
  // received tensor.
  bool dma_ok = 3;

  // Optional information on client-side device locality.
  DeviceLocality client_locality = 4;

  // Optional information on server-side device locality.
  DeviceLocality server_locality = 5;

  // Optional information needed by the RPC subsystem.
  google.protobuf.Any transport_options = 6;

  // Unique identifier for this request. Every RecvTensorRequest must have a
  // unique request_id, and retried RecvTensorRequests must have the same
  // request_id. If request_id is zero, retry detection and response cache
  // are disabled.
  //
  // Retried RecvTensorRequests are problematic because a RecvTensor with no
  // corresponding sender will wait forever, and the tensor may have been
  // delivered to a previous retry. Workers use request_ids to reject retried
  // RecvTensor requests instead of waiting forever.
  int64 request_id = 7;
}

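// Example: a hypothetical RecvTensorRequest in text format. The
// rendezvous_key follows the layout produced by Rendezvous::CreateKey in
// rendezvous.h (src_device;src_incarnation;dst_device;tensor_name;frame:iter);
// all values here are illustrative only:
//
//   step_id: 7
//   rendezvous_key: "/job:worker/replica:0/task:0/device:CPU:0;0000000000000001;/job:worker/replica:0/task:1/device:CPU:0;edge_5_y;0:0"
//   request_id: 1234567890
//
// The matching RecvTensorResponse carries the tensor itself, or has is_dead
// set if the producing node was dead.
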
message RecvTensorResponse {
  // The tensor as a proto.
  TensorProto tensor = 1;

  // If true, this tensor was the output of a dead node, and the
  // content is invalid.
  bool is_dead = 2;

  // The time at which the tensor was available and started to be returned.
  int64 send_start_micros = 3;

  // Optional additional information about how to receive the tensor,
  // e.g. in the event that `RecvTensorRequest.dma_ok` was true.
  google.protobuf.Any transport_options = 4;

  // Whether the receiver should send a MarkRecvFinishedRequest to the sender
  // to ack the message.
  bool require_ack = 5;
}

// Message for managing the response cache maintained on the sender side.
// Currently only used by the gRPC worker service.
message MarkRecvFinishedRequest {
  int64 request_id = 1;
}

message MarkRecvFinishedResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// Logging method request/response messages
//
// NOTE(mrry): This feature is not supported in the open-source
// version, and these messages are expected to change.
//
////////////////////////////////////////////////////////////////////////////////

// Out-of-band request to begin or end logging, or
// to retrieve logs for particular steps.
message LoggingRequest {
  // If true, RPC logging will be enabled.
  bool enable_rpc_logging = 1;

  // If true, RPC logging will be disabled.
  bool disable_rpc_logging = 4;

  // If true, discard any saved logging data (for all steps).
  bool clear = 2;

  // When set, requests all saved log data pertaining to the step.
  // Any log data retrieved is eliminated from the store and cannot be
  // retrieved again.
  repeated int64 fetch_step_id = 3;
}

message LabeledStepStats {
  int64 step_id = 1;
  StepStats step_stats = 2;
}

message LoggingResponse {
  repeated LabeledStepStats step = 1;
}

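// Example: a hypothetical LoggingRequest in text format that enables RPC
// logging and fetches saved logs for one step (values illustrative only):
//
//   enable_rpc_logging: true
//   fetch_step_id: 7
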
////////////////////////////////////////////////////////////////////////////////
//
// Tracing method request/response messages
//
// NOTE(mrry): This feature is not supported in the open-source
// version, and these messages are expected to change.
//
////////////////////////////////////////////////////////////////////////////////

message TraceOpts {
  // Length of the trace to be taken, in seconds.
  double duration = 1;
  // If true, capture step profile locally in each worker. Currently
  // unimplemented.
  bool use_step_profiler = 2;
  // If true, capture kernel events from each worker.
  bool use_kernel_profiler = 3;
  // If true, capture extended profiling events from the TensorFlow process.
  bool use_extended_profiler = 4;
  // If true, capture GPU profiling events locally on each
  // machine. Currently unimplemented.
  bool use_gpu_profiler = 5;
  // If true, collect sampled profile events. Currently unimplemented.
  bool use_sample_profiler = 6;
}

// Out-of-band request to configure distributed tracing.
message TracingRequest {
  TraceOpts options = 1;
}

message TracingResponse {}

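// Example: a hypothetical TracingRequest in text format asking for a
// ten-second kernel-event trace (values illustrative only):
//
//   options {
//     duration: 10.0
//     use_kernel_profiler: true
//   }
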
////////////////////////////////////////////////////////////////////////////////
//
// Raw data transfers in support of Collective Ops.
// These methods are experimental and subject to change.
//
// The intention is to allow collectives to take advantage of the most
// efficient methods available on a platform, e.g. RDMA, and not be
// constrained to use the RPC system in use by other methods.
//
////////////////////////////////////////////////////////////////////////////////

message RecvBufRequest {
  // Use of the fields below may vary by implementation. For example, the
  // buf_ptr and num_bytes may be set only for local operations and
  // not sent on the wire, or only sent on the wire in one direction.

  // Used at server side to find the correct BufRendezvous.
  int64 step_id = 1;

  // Arbitrary string identifying a BufRendezvous entry.
  string buf_rendezvous_key = 2;

  // Size of value expected, must agree with BufRendezvous entry.
  int64 num_bytes = 3;

  // When RDMA is in use, address of destination field on client.
  fixed64 buf_ptr = 4;

  // Optional information on client-side device locality.
  DeviceLocality client_locality = 5;

  // Optional information on server-side device locality.
  DeviceLocality server_locality = 6;

  // Optional, implementation-specific data.
  google.protobuf.Any transport_options = 7;
  // For annotating timeline and device incarnation check.
  string src_device = 8;
  // Optional, for annotating the timeline.
  string dst_device = 9;

  // Depending on the RPC system in use, it may be necessary to set this
  // id to detect resends of RPCs where the server is not aware that
  // the prior RPC failed.
  int64 request_id = 10;

  // Incarnation number of the source device, used to detect worker failures.
  uint64 src_incarnation = 11;
}

message RecvBufResponse {
  // Use of the fields below may vary by implementation. Comments give
  // intended use.

  fixed64 buf_ptr = 1;  // Address of source field on server.
  int64 num_bytes = 2;  // Byte length of buf_ptr field, if set.
  bool is_dead = 3;     // True if value is 'dead' like a tensor.
  // Optional, implementation-specific data.
  google.protobuf.Any transport_options = 4;
  // Optional, for timeline.
  int64 send_start_micros = 5;

  // Whether the receiver should send a MarkRecvFinishedRequest to the sender
  // to ack the message.
  bool require_ack = 6;
}

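// Example: a hypothetical RecvBufRequest in text format. The key, devices,
// and sizes are illustrative only; buf_ptr is meaningful only when client
// and server share an address space or an RDMA transport:
//
//   step_id: 7
//   buf_rendezvous_key: "group_1_instance_3_chunk_0"
//   num_bytes: 4096
//   src_device: "/job:worker/replica:0/task:0/device:GPU:0"
//   dst_device: "/job:worker/replica:0/task:1/device:GPU:0"
//   request_id: 1234567891
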
////////////////////////////////////////////////////////////////////////////////
//
// Collective Op dynamic group resolution messages.
//
////////////////////////////////////////////////////////////////////////////////

// Supplies one or more device names as members of the group identified by
// group_key. The service will respond when all group_size devices become
// known. All devices in the group must have the same type.
message CompleteGroupRequest {
  int32 group_key = 1;
  int32 group_size = 2;
  string device_type = 3;
  repeated string device_name = 4;
  int32 collective_type = 5;
}

// Gives the complete membership of the group identified by group_key.
message CompleteGroupResponse {
  int32 group_key = 1;
  int32 group_size = 2;
  string device_type = 3;
  int32 num_tasks = 4;  // Number of distinct tasks hosting the devices.
  repeated string device_name = 5;
  repeated string task_name = 6;  // Task name prefixes of device_names.
  bytes communicator_key = 7;
}

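// Example: a hypothetical CompleteGroupRequest in text format, contributing
// one device to a group of four (values illustrative only):
//
//   group_key: 1
//   group_size: 4
//   device_type: "GPU"
//   device_name: "/job:worker/replica:0/task:0/device:GPU:0"
//
// Once all four devices have checked in, every caller receives the same
// CompleteGroupResponse listing all device_name and task_name entries.
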
// Supplies data about one collective op belonging to the instance identified
// by instance_key. The service will respond when all group_size ops have
// become known. Most of the data being sent is for correctness checking,
// to ensure that all ops in the instance share common attributes.
message CompleteInstanceRequest {
  string name = 1;
  int32 type = 2;
  DataType data_type = 3;
  TensorShapeProto shape = 4;
  int32 group_key = 5;
  int32 group_size = 6;
  int32 instance_key = 7;
  string device_type = 8;
  repeated int32 subdiv_offset = 9;
  string device = 10;
  bool is_source = 11;
}

// Confirms that every op in the instance has consistently declared itself.
// Also gives the source_rank in case of broadcast.
message CompleteInstanceResponse {
  int32 instance_key = 1;
  int32 source_rank = 2;
  reserved 3;
}

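// Example: a hypothetical CompleteInstanceRequest in text format for a
// collective over the group above (all names and key values, including the
// numeric collective type, are illustrative only):
//
//   name: "allreduce_grad_0"
//   type: 0
//   data_type: DT_FLOAT
//   shape { dim { size: 1024 } }
//   group_key: 1
//   group_size: 4
//   instance_key: 17
//   device_type: "GPU"
//   device: "/job:worker/replica:0/task:0/device:GPU:0"
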
// Request for the next agreed-upon step_id for the specified graph_keys.
// This is used to enable multiple graphs containing nodes from
// a common collective instance to coordinate using the same step_ids.
message GetStepSequenceRequest {
  repeated int64 graph_key = 1;
}

message StepSequence {
  int64 graph_key = 1;
  int64 next_step_id = 2;
}

// Next valid step_ids for one or more graph_keys.
message GetStepSequenceResponse {
  repeated StepSequence step_sequence = 1;
}

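// Example: a hypothetical GetStepSequence exchange in text format (values
// illustrative only). Request:
//
//   graph_key: 1
//
// Response:
//
//   step_sequence {
//     graph_key: 1
//     next_step_id: 129
//   }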