/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

syntax = "proto3";

package tensorflow;

import "google/protobuf/any.proto";
import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/device_attributes.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/framework/tensor.proto";
import "tensorflow/core/framework/tensor_shape.proto";
import "tensorflow/core/framework/types.proto";
import "tensorflow/core/protobuf/config.proto";
import "tensorflow/core/protobuf/debug.proto";
import "tensorflow/core/protobuf/error_codes.proto";
import "tensorflow/core/protobuf/named_tensor.proto";
import "tensorflow/core/protobuf/tensorflow_server.proto";

option cc_enable_arenas = true;
option java_outer_classname = "WorkerProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.distruntime";
option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto";

////////////////////////////////////////////////////////////////////////////////
//
// GetStatus method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message GetStatusRequest {}

message GetStatusResponse {
  repeated DeviceAttributes device_attributes = 1;
}

////////////////////////////////////////////////////////////////////////////////
//
// CreateWorkerSession method request/response messages
//
// For each master session, a corresponding WorkerSession is created on
// every participating worker; all worker-side state for the session is
// scoped by its handle.
//
////////////////////////////////////////////////////////////////////////////////

message CreateWorkerSessionRequest {
  // Sessions are identified by a given handle.
  string session_handle = 1;

  // Defines the configuration of a TensorFlow worker.
  ServerDef server_def = 2;

  // If true, any resources such as Variables used in the session will not be
  // shared with other sessions.
  bool isolate_session_state = 3;

  // The device attributes of all the devices in the cluster.
  repeated DeviceAttributes cluster_device_attributes = 4;

  // The master task name from which the request is sent.
  string master_task = 5;

  // The incarnation ID of the master task's local CPU device.
  // If the target worker already has a WorkerSession created previously with
  // the same master task name but a different incarnation, it usually
  // indicates that the previous master failed before deleting the
  // WorkerSession on the worker. To prevent memory leaks, the worker should
  // garbage collect the old WorkerSessions.
  int64 master_incarnation = 6;

  reserved 7;  // Deprecated config that is embedded within server_def now.
}
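// As an illustrative sketch only, a CreateWorkerSessionRequest in proto text
// format might look like the following (the handle, task name, and
// incarnation are hypothetical values):
//
//   session_handle: "session_1"
//   server_def { job_name: "worker" task_index: 0 protocol: "grpc" }
//   isolate_session_state: true
//   master_task: "/job:master/replica:0/task:0"
//   master_incarnation: 1234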
message CreateWorkerSessionResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// DeleteWorkerSession method request/response messages
//
// Deletes all worker-side state associated with the given session handle.
//
////////////////////////////////////////////////////////////////////////////////

message DeleteWorkerSessionRequest {
  // Sessions are identified by a given handle.
  string session_handle = 1;
}

message DeleteWorkerSessionResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// RegisterGraph method request/response messages
//
// For each session, after the master places every node on a device, it
// partitions the whole graph into many subgraphs. All the nodes in a
// subgraph are assigned to the same worker, but potentially to many devices
// owned by that worker (e.g. cpu0, plus gpu0, gpu1, ..., gpu7). The master
// registers subgraphs with a worker before running any steps. A successful
// registration returns a graph handle to be used in later RunGraph requests.
//
////////////////////////////////////////////////////////////////////////////////

message RegisterGraphRequest {
  // Subgraphs are scoped within one session.
  string session_handle = 1;

  // Set to true if `CreateWorkerSession` was called for `session_handle`.
  bool create_worker_session_called = 6;

  // "graph_def" has the subgraph of nodes for this worker, with each node
  // having its device_name filled in.
  GraphDef graph_def = 2;

  // True iff the graph (before partitioning) contains control flow nodes.
  //
  // As of 01/11/2015, this is no longer set by clients.
  bool has_control_flow = 3 [deprecated = true];

  // Configuration options for the session in which this graph was created.
  GraphOptions graph_options = 4;

  // Field(s) used by TensorFlow Debugger (tfdbg).
  DebugOptions debug_options = 5;

  // If graph_def contains any collective ops, this must be a positive
  // integer used to coordinate execution with other graphs. All graphs in a
  // distributed execution with the same collective_graph_key will coordinate
  // to use the same step_id concurrently, so that BufRendezvous entries will
  // make the correct values accessible.
  int64 collective_graph_key = 7;

  // ConfigProto from the session in which this graph was created. Contains
  // additional parameters beyond graph_options, including the name of the
  // requested executor.
  ConfigProto config_proto = 8;
}

message RegisterGraphResponse {
  // If the registration succeeds, returns an opaque graph_handle to the
  // master. The master calls RunGraph with graph_handle to compute different
  // steps.
  string graph_handle = 1;
}
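// As a sketch only, a RegisterGraph exchange in proto text format might look
// like the following (the handles are hypothetical, and graph_def is
// abbreviated; a real Const node would also carry dtype/value attrs):
//
//   RegisterGraphRequest:
//     session_handle: "session_1"
//     create_worker_session_called: true
//     graph_def {
//       node { name: "a" op: "Const"
//              device: "/job:worker/replica:0/task:0/device:CPU:0" }
//     }
//
//   RegisterGraphResponse:
//     graph_handle: "graph_42"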
////////////////////////////////////////////////////////////////////////////////
//
// DeregisterGraph method request/response messages
//
// The master deregisters the given graph_handle when the graph is no
// longer needed (e.g., the overall graph is re-scheduled and nodes
// are re-placed).
//
// The worker also deregisters a graph_handle automatically, according to
// a TTL-based policy, in case of master restarts.
//
////////////////////////////////////////////////////////////////////////////////

message DeregisterGraphRequest {
  // The session_handle used when registering the graph. If session_handle is
  // empty, a single global namespace is used.
  string session_handle = 2;

  // Set to true if `CreateWorkerSession` was called for `session_handle`.
  bool create_worker_session_called = 3;

  // REQUIRED: graph_handle must be returned by a RegisterGraph call
  // to the same WorkerService.
  string graph_handle = 1;
}

message DeregisterGraphResponse {
  // TODO(mrry): Optionally add summary stats for the graph.
}

////////////////////////////////////////////////////////////////////////////////
//
// CleanupAll method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message CleanupAllRequest {
  // A list of container names.
  //
  // If 'container' is not empty, releases resources in the given
  // containers on all devices.
  //
  // If 'container' is empty, releases resources in the default
  // container on all devices.
  repeated string container = 1;
}

message CleanupAllResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// RunGraph request / response messages
//
// The worker executes all subgraphs registered under graph_handle.
// RunGraph returns after the execution finishes or an error is
// encountered.
// A sequence of RunGraphRequests with is_partial set may be sent to RunGraph
// for partial graph execution.
//
////////////////////////////////////////////////////////////////////////////////

// Options specific to the execution of a single step.
message ExecutorOpts {
  bool record_costs = 1;
  bool record_timeline = 3;
  bool record_partition_graphs = 4;
  bool report_tensor_allocations_upon_oom = 5;
}
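// For example (values hypothetical), a master that wants a step timeline and
// the per-partition graphs for profiling might set:
//
//   exec_opts { record_timeline: true record_partition_graphs: true }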
message RunGraphRequest {
  // session_handle is the master-generated unique id for this session.
  // If session_handle is non-empty, it must be the same as the one used when
  // registering the graph. If it is empty, a single global namespace is used
  // to search for the graph_handle.
  string session_handle = 8;

  // Set to true if `CreateWorkerSession` was called for `session_handle`.
  bool create_worker_session_called = 10;

  // REQUIRED: graph_handle must be returned by a RegisterGraph call
  // to the same WorkerService.
  string graph_handle = 1;

  // A unique ID to distinguish different runs of the same graph.
  //
  // The master generates a globally unique `step_id` to distinguish
  // different runs of the graph computation. Subgraphs communicate
  // (e.g., send/recv ops) with each other using `step_id` to
  // distinguish tensors generated by different runs.
  int64 step_id = 2;

  // Options for this step.
  ExecutorOpts exec_opts = 5;

  // Runs the graph.
  //
  // Sends the tensors in "send" into the graph before the run and
  // fetches the keys into `RunGraphResponse.recv` after the run.
  repeated NamedTensorProto send = 3;
  repeated string recv_key = 4;

  // True if the RunGraphRequest is a partial run request.
  bool is_partial = 6;
  // True if this is the last partial run request in a sequence of requests.
  bool is_last_partial_run = 7;

  // If true, then some errors, e.g., execution errors that have long error
  // messages, may return an OK RunGraphResponse with the actual error saved
  // in the status_code/status_error_message fields of the response body.
  // This is a workaround, since the RPC subsystem may truncate long metadata
  // messages.
  bool store_errors_in_response_body = 9;

  // Unique identifier for this request. Every RunGraphRequest must have a
  // unique request_id, and retried RunGraphRequests must have the same
  // request_id. If request_id is zero, retry detection is disabled.
  //
  // Retried RunGraphRequests are problematic because they may issue a
  // RecvTensor that will have no corresponding sender and will wait forever.
  // Workers use request_ids to reject retried RunGraph requests instead of
  // waiting forever.
  int64 request_id = 11;

  // Next: 12
}

message RunGraphResponse {
  // A list of tensors corresponding to those requested by
  // `RunGraphRequest.recv_key`.
  repeated NamedTensorProto recv = 1;

  // If the request asked for execution stats, the cost graph, or the partition
  // graphs, these are returned here.
  // TODO(suharshs): Package these in a RunMetadata instead.
  StepStats step_stats = 2;
  CostGraphDef cost_graph = 3;
  repeated GraphDef partition_graph = 4;

  // If store_errors_in_response_body is true in the request, then the server
  // may optionally return an OK status for the RPC and fill the true status
  // into the fields below, to allow for messages that are too long to fit in
  // metadata.
  error.Code status_code = 5;
  string status_error_message = 6;
}

////////////////////////////////////////////////////////////////////////////////
//
// CleanupGraph method request/response messages
//
// After the master receives RunGraph responses from all workers, the
// master instructs every worker to clean up any remaining state of a
// step (e.g. tensors buffered by a `Send` op but not picked up by
// other workers). The master does not necessarily need to wait for
// completion of CleanupGraph calls.
//
// Workers should clean up step state automatically according to a
// TTL-based policy in case of master restarts.
//
////////////////////////////////////////////////////////////////////////////////

message CleanupGraphRequest {
  int64 step_id = 1;
}

message CleanupGraphResponse {}
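// As a sketch of the step lifecycle (the step id is hypothetical): a master
// that ran step 100 via RunGraph would later release any leftover rendezvous
// state for that step with:
//
//   CleanupGraphRequest:
//     step_id: 100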
////////////////////////////////////////////////////////////////////////////////
//
// RecvTensor method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message RecvTensorRequest {
  // The step in which the tensor will be produced.
  //
  // REQUIRED: This must eventually correspond to the `step_id` passed
  // into a RunGraph call on the same WorkerService.
  int64 step_id = 1;

  // A key identifying the channel to receive tensors from. A RecvTensor
  // request retrieves one tensor from the channel, but multiple tensors can
  // be sent and received over the same channel with multiple RecvTensor
  // requests. See rendezvous.h for details.
  string rendezvous_key = 2;

  // If true, use an out-of-band DMA mechanism to transfer the
  // received tensor.
  bool dma_ok = 3;

  // Optional information on client-side device locality.
  DeviceLocality client_locality = 4;

  // Optional information on server-side device locality.
  DeviceLocality server_locality = 5;

  // Optional information needed by the RPC subsystem.
  google.protobuf.Any transport_options = 6;

  // Unique identifier for this request. Every RecvTensorRequest must have a
  // unique request_id, and retried RecvTensorRequests must have the same
  // request_id. If request_id is zero, retry detection and the response cache
  // are disabled.
  //
  // Retried RecvTensorRequests are problematic because a RecvTensor with no
  // corresponding sender will wait forever, and the tensor may have been
  // delivered to a previous retry. Workers use request_ids to reject retried
  // RecvTensor requests instead of waiting forever.
  int64 request_id = 7;
}

message RecvTensorResponse {
  // The tensor as a proto.
  TensorProto tensor = 1;

  // If true, this tensor was the output of a dead node, and the
  // content is invalid.
  bool is_dead = 2;

  // The time at which the tensor was available and started to be returned.
  int64 send_start_micros = 3;

  // Optional additional information about how to receive the tensor,
  // e.g. in the event that `RecvTensorRequest.dma_ok` was true.
  google.protobuf.Any transport_options = 4;

  // Whether the receiver should send a MarkRecvFinishedRequest to the sender
  // to ack the message.
  bool require_ack = 5;
}

// Message for managing the response cache maintained on the sender side.
// Currently only used by the gRPC worker service.
message MarkRecvFinishedRequest {
  int64 request_id = 1;
}

message MarkRecvFinishedResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// Logging method request/response messages
//
// NOTE(mrry): This feature is not supported in the open-source
// version, and these messages are expected to change.
//
////////////////////////////////////////////////////////////////////////////////

// Out-of-band request to begin or end logging, or
// to retrieve logs for particular steps.
message LoggingRequest {
  // If true, RPC logging will be enabled.
  bool enable_rpc_logging = 1;

  // If true, RPC logging will be disabled.
  bool disable_rpc_logging = 4;

  // If true, discard any saved logging data (for all steps).
  bool clear = 2;

  // When set, requests all saved log data pertaining to each step.
  // Any log data retrieved is eliminated from the store and cannot be
  // retrieved again.
  repeated int64 fetch_step_id = 3;
}

message LabeledStepStats {
  int64 step_id = 1;
  StepStats step_stats = 2;
}

message LoggingResponse {
  repeated LabeledStepStats step = 1;
}
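// As an illustrative sketch (the step ids are hypothetical), a LoggingRequest
// that enables RPC logging and fetches any stats already saved for two steps:
//
//   enable_rpc_logging: true
//   fetch_step_id: 100
//   fetch_step_id: 101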
////////////////////////////////////////////////////////////////////////////////
//
// Tracing method request/response messages
//
// NOTE(mrry): This feature is not supported in the open-source
// version, and these messages are expected to change.
//
////////////////////////////////////////////////////////////////////////////////

message TraceOpts {
  // Length of the trace to be taken, in seconds.
  double duration = 1;
  // If true, capture step profile locally in each worker. Currently
  // unimplemented.
  bool use_step_profiler = 2;
  // If true, capture kernel events from each worker.
  bool use_kernel_profiler = 3;
  // If true, capture extended profiling events from the TensorFlow process.
  bool use_extended_profiler = 4;
  // If true, capture GPU profiling events locally on each
  // machine. Currently unimplemented.
  bool use_gpu_profiler = 5;
  // If true, collect sampled profile events. Currently unimplemented.
  bool use_sample_profiler = 6;
}

// Out-of-band request to configure distributed tracing.
message TracingRequest {
  TraceOpts options = 1;
}

message TracingResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// Raw data transfers in support of Collective Ops.
// These methods are experimental and subject to change.
//
// The intention is to allow collectives to take advantage of the most
// efficient methods available on a platform, e.g. RDMA, and not be
// constrained to use the RPC system in use by other methods.
//
////////////////////////////////////////////////////////////////////////////////

message RecvBufRequest {
  // Use of the fields below may vary by implementation. For example,
  // buf_ptr and num_bytes may be set only for local operations and
  // not sent on the wire, or only sent on the wire in one direction.

  // Used on the server side to find the correct BufRendezvous.
  int64 step_id = 1;

  // Arbitrary string identifying a BufRendezvous entry.
  string buf_rendezvous_key = 2;

  // Size of the value expected; must agree with the BufRendezvous entry.
  int64 num_bytes = 3;

  // When RDMA is in use, the address of the destination field on the client.
  fixed64 buf_ptr = 4;

  // Optional information on client-side device locality.
  DeviceLocality client_locality = 5;

  // Optional information on server-side device locality.
  DeviceLocality server_locality = 6;

  // Optional, implementation-specific data.
  google.protobuf.Any transport_options = 7;
  // For annotating the timeline and the device incarnation check.
  string src_device = 8;
  // Optional, for annotating the timeline.
  string dst_device = 9;

  // Depending on the RPC system in use, it may be necessary to set this
  // id to detect resends of RPCs where the server is not aware that
  // the prior RPC failed.
  int64 request_id = 10;

  // Incarnation number of the source device, used to detect worker failures.
  uint64 src_incarnation = 11;
}

message RecvBufResponse {
  // Use of the fields below may vary by implementation. Comments give
  // intended use.

  fixed64 buf_ptr = 1;  // Address of the source field on the server.
  int64 num_bytes = 2;  // Byte length of the buf_ptr field, if set.
  bool is_dead = 3;     // True if the value is 'dead', as a tensor can be.
  // Optional, implementation-specific data.
  google.protobuf.Any transport_options = 4;
  // Optional, for the timeline.
  int64 send_start_micros = 5;

  // Whether the receiver should send a MarkRecvFinishedRequest to the sender
  // to ack the message.
  bool require_ack = 6;
}

////////////////////////////////////////////////////////////////////////////////
//
// Collective Op dynamic group resolution messages.
//
////////////////////////////////////////////////////////////////////////////////

// Supplies one or more device names as members of the group identified by
// group_key. The service will respond once all group_size devices become
// known. All devices in the group must have the same type.
message CompleteGroupRequest {
  int32 group_key = 1;
  int32 group_size = 2;
  string device_type = 3;
  int32 collective_type = 5;
  DeviceAttributes device_attributes = 6;

  reserved 4;
}
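// As an illustrative sketch (the group key and device name are hypothetical),
// a device joining a two-device GPU group might send:
//
//   group_key: 1
//   group_size: 2
//   device_type: "GPU"
//   device_attributes { name: "/job:worker/replica:0/task:0/device:GPU:0" }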
// Gives the complete membership of the group identified by group_key.
message CompleteGroupResponse {
  int32 group_key = 1;
  int32 group_size = 2;
  string device_type = 3;
  int32 num_tasks = 4;  // number of distinct tasks hosting the devices
  bytes communicator_key = 7;
  repeated DeviceAttributes device_attributes = 8;

  reserved 5, 6;
}

// Supplies data about one collective op belonging to the instance identified
// by instance_key. The service will respond once all group_size ops have
// become known. Most of the data being sent is for correctness checking, to
// ensure that all ops in the instance share common attributes.
message CompleteInstanceRequest {
  string name = 1;
  int32 type = 2;
  DataType data_type = 3;
  TensorShapeProto shape = 4;
  int32 group_key = 5;
  int32 group_size = 6;
  int32 instance_key = 7;
  string device_type = 8;
  repeated int32 subdiv_offset = 9;
  string device = 10;
  bool is_source = 11;
}

// Confirms that every op in the instance has consistently declared itself.
// Also gives the source_rank in the case of a broadcast.
message CompleteInstanceResponse {
  int32 instance_key = 1;
  int32 source_rank = 2;
  reserved 3;
}

// Request for the next agreed-upon step_id for the specified graph_keys.
// This is used to enable multiple graphs containing nodes from a common
// collective instance to coordinate using the same step_ids.
message GetStepSequenceRequest {
  repeated int64 graph_key = 1;
}

message StepSequence {
  int64 graph_key = 1;
  int64 next_step_id = 2;
}

// Next valid step_ids for one or more graph_keys.
message GetStepSequenceResponse {
  repeated StepSequence step_sequence = 1;
}
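// As a sketch only (the graph key and step id are hypothetical), a
// GetStepSequence exchange in proto text format:
//
//   GetStepSequenceRequest:
//     graph_key: 7
//
//   GetStepSequenceResponse:
//     step_sequence { graph_key: 7 next_step_id: 101 }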