/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

syntax = "proto3";

package tensorflow;

import "google/protobuf/any.proto";
import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/device_attributes.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/framework/tensor.proto";
import "tensorflow/core/framework/tensor_shape.proto";
import "tensorflow/core/framework/types.proto";
import "tensorflow/core/protobuf/config.proto";
import "tensorflow/core/protobuf/debug.proto";
import "tensorflow/core/protobuf/error_codes.proto";
import "tensorflow/core/protobuf/named_tensor.proto";
import "tensorflow/core/protobuf/tensorflow_server.proto";

option cc_enable_arenas = true;
option java_outer_classname = "WorkerProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.distruntime";
option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto";

////////////////////////////////////////////////////////////////////////////////
//
// GetStatus method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message GetStatusRequest {}

message GetStatusResponse {
  repeated DeviceAttributes device_attributes = 1;
}

////////////////////////////////////////////////////////////////////////////////
//
// CreateWorkerSession method request/response messages
//
// For each session, the master creates a corresponding worker session on
// every worker that participates in the session.
//
////////////////////////////////////////////////////////////////////////////////

message CreateWorkerSessionRequest {
  // Sessions are identified by a given handle.
  string session_handle = 1;

  // Defines the configuration of a TensorFlow worker.
  ServerDef server_def = 2;

  // If true, any resources such as Variables used in the session will not be
  // shared with other sessions.
  bool isolate_session_state = 3;

  // The device attributes of all the devices in the cluster.
  repeated DeviceAttributes cluster_device_attributes = 4;

  // The master task name from which the request is sent.
  string master_task = 5;

  // The incarnation ID of the master task's local CPU device.
  // If the target worker already has a WorkerSession created previously with
  // the same master task name but a different incarnation, it usually
  // indicates that the previous master failed before deleting the
  // WorkerSession on the worker. To prevent memory leaks, the worker should
  // garbage collect the old WorkerSessions.
  int64 master_incarnation = 6;
}

message CreateWorkerSessionResponse {}
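
// A minimal sketch, in textproto form, of the CreateWorkerSessionRequest a
// master might send; the handle, task name, and incarnation value below are
// hypothetical:
//
//   session_handle: "session_1234"
//   isolate_session_state: true
//   master_task: "/job:master/replica:0/task:0"
//   master_incarnation: 1193043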

////////////////////////////////////////////////////////////////////////////////
//
// DeleteWorkerSession method request/response messages
//
// Deletes all worker-side state associated with the given session handle.
//
////////////////////////////////////////////////////////////////////////////////

message DeleteWorkerSessionRequest {
  // Sessions are identified by a given handle.
  string session_handle = 1;
}

message DeleteWorkerSessionResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// RegisterGraph method request/response messages
//
// For each session, after the master places every node on a device, it
// partitions the whole graph into many subgraphs. All the nodes in a
// subgraph are assigned to the same worker, but potentially to many devices
// owned by that worker (e.g. cpu0, plus gpu0, gpu1, ..., gpu7). The master
// registers subgraphs with a worker before running any steps. A successful
// registration returns a graph handle to be used in later RunGraph requests.
//
////////////////////////////////////////////////////////////////////////////////

message RegisterGraphRequest {
  // Subgraphs are scoped within one session.
  string session_handle = 1;

  // Set to true if `CreateWorkerSession` was called for `session_handle`.
  bool create_worker_session_called = 6;

  // "graph_def" has the subgraph of nodes for this worker, with each node
  // having its device_name filled in.
  GraphDef graph_def = 2;

  // True iff the graph (before partitioning) contains control flow nodes.
  //
  // As of 01/11/2015, this is no longer set by clients.
  bool has_control_flow = 3 [deprecated = true];

  // Configuration options for the session in which this graph was created.
  GraphOptions graph_options = 4;

  // Field(s) used by TensorFlow Debugger (tfdbg).
  DebugOptions debug_options = 5;

  // If graph_def contains any collective ops this must be a positive
  // integer used to coordinate execution with other graphs. All
  // graphs in a distributed execution with the same
  // collective_graph_key will coordinate to use the same step_id
  // concurrently so that BufRendezvous entries will make the correct
  // values accessible.
  int64 collective_graph_key = 7;

  // ConfigProto from the session in which this graph was created.
  // Contains additional parameters beyond graph_options, including
  // the name of the requested executor.
  ConfigProto config_proto = 8;
}

message RegisterGraphResponse {
  // If the registration succeeds, returns an opaque graph_handle to
  // the master. The master calls RunGraph with graph_handle to
  // compute different steps.
  string graph_handle = 1;
}
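
// A minimal sketch of the register-then-run handshake, in textproto form; the
// handle values are hypothetical and the subgraph contents are elided:
//
//   RegisterGraphRequest:
//     session_handle: "session_1234"
//     create_worker_session_called: true
//     graph_def {}  # this worker's partition, device names filled in
//
//   RegisterGraphResponse:
//     graph_handle: "graph_5678"  # opaque; passed to later RunGraph calls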

////////////////////////////////////////////////////////////////////////////////
//
// DeregisterGraph method request/response messages
//
// The master deregisters the given graph_handle when the graph is no
// longer needed (e.g., the overall graph is re-scheduled and nodes
// are re-placed).
//
// The worker deregisters a graph_handle automatically according to a
// TTL-based policy in case of master restarts.
//
////////////////////////////////////////////////////////////////////////////////

message DeregisterGraphRequest {
  // The session_handle used when registering the graph. If session_handle is
  // empty, a single global namespace is used.
  string session_handle = 2;

  // Set to true if `CreateWorkerSession` was called for `session_handle`.
  bool create_worker_session_called = 3;

  // REQUIRED: graph_handle must be returned by a RegisterGraph call
  // to the same WorkerService.
  string graph_handle = 1;
}

message DeregisterGraphResponse {
  // TODO(mrry): Optionally add summary stats for the graph.
}

////////////////////////////////////////////////////////////////////////////////
//
// CleanupAll method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message CleanupAllRequest {
  // A list of container names.
  //
  // If 'container' is not empty, releases resources in the given
  // containers in all devices.
  //
  // If 'container' is empty, releases resources in the default
  // container in all devices.
  repeated string container = 1;
}

message CleanupAllResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// RunGraph request / response messages
//
// The worker executes all subgraphs registered under graph_handle.
// RunGraph returns after the execution finishes or an error is
// encountered.
// A sequence of RunGraphRequests with is_partial set may be sent to RunGraph
// for partial graph execution.
//
////////////////////////////////////////////////////////////////////////////////

// Options specific to the execution of a single step.
message ExecutorOpts {
  bool record_costs = 1;
  bool record_timeline = 3;
  bool record_partition_graphs = 4;
  bool report_tensor_allocations_upon_oom = 5;
}

message RunGraphRequest {
  // session_handle is the master-generated unique id for this session.
  // If session_handle is non-empty, it must be the same as used when
  // registering the graph. If it is empty, a single global namespace is used
  // to search for the graph_handle.
  string session_handle = 8;

  // Set to true if `CreateWorkerSession` was called for `session_handle`.
  bool create_worker_session_called = 10;

  // REQUIRED: graph_handle must be returned by a RegisterGraph call
  // to the same WorkerService.
  string graph_handle = 1;

  // A unique ID to distinguish different runs of the same graph.
  //
  // The master generates a globally unique `step_id` to distinguish
  // different runs of the graph computation. Subgraphs communicate
  // (e.g., send/recv ops) with each other using `step_id` to
  // distinguish tensors generated by different runs.
  int64 step_id = 2;

  // Options for this step.
  ExecutorOpts exec_opts = 5;

  // Runs the graph.
  //
  // Sends the tensors in "send" into the graph before the run and
  // fetches the keys into `RunGraphResponse.recv` after the run.
  repeated NamedTensorProto send = 3;
  repeated string recv_key = 4;

  // True if the RunGraphRequest is a partial run request.
  bool is_partial = 6;
  // True if this is the last partial run request in a sequence of requests.
  bool is_last_partial_run = 7;

  // If true then some errors, e.g., execution errors that have long
  // error messages, may return an OK RunGraphResponse with the actual
  // error saved in the status_code/status_error_message fields of the
  // response body. This is a workaround since the RPC subsystem may
  // truncate long metadata messages.
  bool store_errors_in_response_body = 9;

  // Unique identifier for this request. Every RunGraphRequest must have a
  // unique request_id, and retried RunGraphRequests must have the same
  // request_id. If request_id is zero, retry detection is disabled.
  //
  // Retried RunGraphRequests are problematic because they may issue a
  // RecvTensor that will have no corresponding sender and will wait forever.
  // Workers use request_ids to reject retried RunGraph requests instead of
  // waiting forever.
  int64 request_id = 11;

  // Next: 12
}
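
// A minimal sketch, in textproto form, of a non-partial RunGraphRequest; all
// values are hypothetical:
//
//   session_handle: "session_1234"
//   create_worker_session_called: true
//   graph_handle: "graph_5678"  # returned by RegisterGraph
//   step_id: 42
//   exec_opts { record_timeline: true }
//   recv_key: "y:0"  # names a fetched tensor; keys are generated by the master
//   request_id: 9081726354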

message RunGraphResponse {
  // A list of tensors corresponding to those requested by
  // `RunGraphRequest.recv_key`.
  repeated NamedTensorProto recv = 1;

  // If the request asked for execution stats, the cost graph, or the partition
  // graphs, these are returned here.
  // TODO(suharshs): Package these in a RunMetadata instead.
  StepStats step_stats = 2;
  CostGraphDef cost_graph = 3;
  repeated GraphDef partition_graph = 4;

  // If store_errors_in_response_body is true in the request, then
  // optionally the server may return an OK status for the RPC and
  // fill the true status into the fields below, to allow for messages
  // that are too long to fit in metadata.
  error.Code status_code = 5;
  string status_error_message = 6;
}

////////////////////////////////////////////////////////////////////////////////
//
// CleanupGraph method request/response messages
//
// After the master receives RunGraph responses from all workers, the
// master instructs every worker to clean up any remaining state of a
// step (e.g. tensors buffered by a `Send` op but not picked up by
// other workers). The master does not necessarily need to wait for
// completion of CleanupGraph calls.
//
// Workers should clean up step states automatically according to a
// TTL-based policy in case of master restarts.
//
////////////////////////////////////////////////////////////////////////////////

message CleanupGraphRequest {
  int64 step_id = 1;
}

message CleanupGraphResponse {}
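
// An illustrative example: once the (hypothetical) step 42 has completed on
// all workers, the master may broadcast a CleanupGraphRequest such as
//
//   step_id: 42
//
// to every worker, without waiting for the responses.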

////////////////////////////////////////////////////////////////////////////////
//
// RecvTensor method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message RecvTensorRequest {
  // The step in which the tensor will be produced.
  //
  // REQUIRED: This must eventually correspond to the `step_id` passed
  // into a RunGraph call on the same WorkerService.
  int64 step_id = 1;

  // A key identifying the channel to receive tensors from. A RecvTensor
  // request retrieves one tensor from the channel, but multiple tensors can
  // be sent and received over the same channel with multiple RecvTensor
  // requests. See rendezvous.h for details.
  string rendezvous_key = 2;

  // If true, use an out-of-band DMA mechanism to transfer the
  // received tensor.
  bool dma_ok = 3;

  // Optional information on client-side device locality.
  DeviceLocality client_locality = 4;

  // Optional information on server-side device locality.
  DeviceLocality server_locality = 5;

  // Optional information needed by the RPC subsystem.
  google.protobuf.Any transport_options = 6;

  // Unique identifier for this request. Every RecvTensorRequest must have a
  // unique request_id, and retried RecvTensorRequests must have the same
  // request_id. If request_id is zero, retry detection and the response cache
  // are disabled.
  //
  // Retried RecvTensorRequests are problematic because a RecvTensor with no
  // corresponding sender will wait forever, and the tensor may have been
  // delivered to a previous retry. Workers use request_ids to reject retried
  // RecvTensor requests instead of waiting forever.
  int64 request_id = 7;
}

message RecvTensorResponse {
  // The tensor as a proto.
  TensorProto tensor = 1;

  // If true, this tensor was the output of a dead node, and the
  // content is invalid.
  bool is_dead = 2;

  // The time at which the tensor was available and started to be returned.
  int64 send_start_micros = 3;

  // Optional additional information about how to receive the tensor,
  // e.g. in the event that `RecvTensorRequest.dma_ok` was true.
  google.protobuf.Any transport_options = 4;

  // Whether the receiver should send a MarkRecvFinishedRequest to the sender
  // to ack the message.
  bool require_ack = 5;
}

// Message for managing the response cache maintained on the sender side.
// Currently only used by the gRPC worker service.
message MarkRecvFinishedRequest {
  int64 request_id = 1;
}

message MarkRecvFinishedResponse {}
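
// A minimal sketch, in textproto form, of a RecvTensorRequest; the ids are
// hypothetical, and the key below shows only one plausible rendering
// (src;src_incarnation;dst;tensor_name;frame:iter) of the channel-key format
// documented in rendezvous.h:
//
//   step_id: 42
//   rendezvous_key: "/job:worker/replica:0/task:0/device:CPU:0;0000000000000001;/job:worker/replica:0/task:1/device:GPU:0;edge_7_y;0:0"
//   dma_ok: false
//   request_id: 9081726355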

////////////////////////////////////////////////////////////////////////////////
//
// Logging method request/response messages
//
// NOTE(mrry): This feature is not supported in the open-source
// version, and these messages are expected to change.
//
////////////////////////////////////////////////////////////////////////////////

// Out-of-band request to begin or end logging, or
// to retrieve logs for particular steps.
message LoggingRequest {
  // If true, RPC logging will be enabled.
  bool enable_rpc_logging = 1;

  // If true, RPC logging will be disabled.
  bool disable_rpc_logging = 4;

  // If true, discard any saved logging data (for all steps).
  bool clear = 2;

  // When set, requests all saved log data pertaining to each listed step.
  // Any log data retrieved is eliminated from the store and cannot be
  // retrieved again.
  repeated int64 fetch_step_id = 3;
}

message LabeledStepStats {
  int64 step_id = 1;
  StepStats step_stats = 2;
}

message LoggingResponse {
  repeated LabeledStepStats step = 1;
}

////////////////////////////////////////////////////////////////////////////////
//
// Tracing method request/response messages
//
// NOTE(mrry): This feature is not supported in the open-source
// version, and these messages are expected to change.
//
////////////////////////////////////////////////////////////////////////////////

message TraceOpts {
  // Length of the trace to be taken, in seconds.
  double duration = 1;
  // If true, capture step profile locally in each worker. Currently
  // unimplemented.
  bool use_step_profiler = 2;
  // If true, capture kernel events from each worker.
  bool use_kernel_profiler = 3;
  // If true, capture extended profiling events from the TensorFlow process.
  bool use_extended_profiler = 4;
  // If true, capture GPU profiling events locally on each
  // machine. Currently unimplemented.
  bool use_gpu_profiler = 5;
  // If true, collect sampled profile events. Currently unimplemented.
  bool use_sample_profiler = 6;
}

// Out-of-band request to configure distributed tracing.
message TracingRequest {
  TraceOpts options = 1;
}

message TracingResponse {}

////////////////////////////////////////////////////////////////////////////////
//
// Raw data transfers in support of Collective Ops.
// These methods are experimental and subject to change.
//
// The intention is to allow collectives to take advantage of the most
// efficient methods available on a platform, e.g. RDMA, and not be
// constrained to use the RPC system in use by other methods.
//
////////////////////////////////////////////////////////////////////////////////

message RecvBufRequest {
  // Use of the fields below may vary by implementation. For example
  // the buf_ptr and num_bytes may be set only for local operations and
  // not sent on the wire, or only sent on the wire in one direction.

  // Used at server side to find the correct BufRendezvous.
  int64 step_id = 1;

  // Arbitrary string identifying a BufRendezvous entry.
  string buf_rendezvous_key = 2;

  // Size of value expected; must agree with the BufRendezvous entry.
  int64 num_bytes = 3;

  // When RDMA is in use, address of destination field on client.
  fixed64 buf_ptr = 4;

  // Optional information on client-side device locality.
  DeviceLocality client_locality = 5;

  // Optional information on server-side device locality.
  DeviceLocality server_locality = 6;

  // Optional, implementation-specific data.
  google.protobuf.Any transport_options = 7;
  // For annotating timeline and device incarnation check.
  string src_device = 8;
  // Optional, for annotating the timeline.
  string dst_device = 9;

  // Depending on the RPC system in use, it may be necessary to set this
  // id to detect resends of RPCs where the server is not aware that
  // the prior RPC failed.
  int64 request_id = 10;

  // Incarnation number of the source device, used to detect worker failures.
  uint64 src_incarnation = 11;
}

message RecvBufResponse {
  // Use of the fields below may vary by implementation. Comments give
  // intended use.

  fixed64 buf_ptr = 1;  // Address of source field on server.
  int64 num_bytes = 2;  // Byte length of buf_ptr field, if set.
  bool is_dead = 3;     // True if value is 'dead' like a tensor.
  // Optional, implementation-specific data.
  google.protobuf.Any transport_options = 4;
  // Optional, for timeline.
  int64 send_start_micros = 5;

  // Whether the receiver should send a MarkRecvFinishedRequest to the sender
  // to ack the message.
  bool require_ack = 6;
}
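
// A minimal sketch, in textproto form, of a RecvBufRequest for a collective
// transfer; the key, size, ids, and device names are hypothetical:
//
//   step_id: 42
//   buf_rendezvous_key: "collective_1_3_chunk_0"
//   num_bytes: 4096
//   src_device: "/job:worker/replica:0/task:0/device:GPU:0"
//   dst_device: "/job:worker/replica:0/task:1/device:GPU:0"
//   src_incarnation: 1
//   request_id: 9081726356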

////////////////////////////////////////////////////////////////////////////////
//
// Collective Op dynamic group resolution messages.
//
////////////////////////////////////////////////////////////////////////////////

// Supplies one or more device names as members of the group identified by
// group_key. The service will respond when all group_size devices become
// known. All devices in the group must have the same type.
message CompleteGroupRequest {
  int32 group_key = 1;
  int32 group_size = 2;
  string device_type = 3;
  int32 collective_type = 5;
  DeviceAttributes device_attributes = 6;

  reserved 4;
}

// Gives the complete membership of the group identified by group_key.
message CompleteGroupResponse {
  int32 group_key = 1;
  int32 group_size = 2;
  string device_type = 3;
  int32 num_tasks = 4;  // Number of distinct tasks hosting the devices.
  bytes communicator_key = 7;
  repeated DeviceAttributes device_attributes = 8;

  reserved 5, 6;
}

// Supplies data about one collective op belonging to the instance identified
// by instance_key. The service will respond when all group_size ops have
// become known. Most of the data being sent is for correctness checking,
// to ensure that all ops in the instance share common attributes.
message CompleteInstanceRequest {
  string name = 1;
  int32 type = 2;
  DataType data_type = 3;
  TensorShapeProto shape = 4;
  int32 group_key = 5;
  int32 group_size = 6;
  int32 instance_key = 7;
  string device_type = 8;
  repeated int32 subdiv_offset = 9;
  string device = 10;
  bool is_source = 11;
}

// Confirms that every op in the instance has consistently declared itself.
// Also gives the source_rank in case of broadcast.
message CompleteInstanceResponse {
  int32 instance_key = 1;
  int32 source_rank = 2;
  reserved 3;
}

// Request for the next agreed-upon step_id for the specified graph_keys.
// This is used to enable multiple graphs containing nodes from
// a common collective instance to coordinate using the same step_ids.
message GetStepSequenceRequest {
  repeated int64 graph_key = 1;
}

message StepSequence {
  int64 graph_key = 1;
  int64 next_step_id = 2;
}

// Next valid step_ids for one or more graph_keys.
message GetStepSequenceResponse {
  repeated StepSequence step_sequence = 1;
}
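
// An illustrative example of group resolution, in textproto form; the keys
// and device name are hypothetical. Each of four workers sends a
// CompleteGroupRequest such as
//
//   group_key: 1
//   group_size: 4
//   device_type: "GPU"
//   device_attributes { name: "/job:worker/replica:0/task:0/device:GPU:0" }
//
// and the service responds to all participants once all four member devices
// are known.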