/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

syntax = "proto3";

package tensorflow;
option cc_enable_arenas = true;
option java_outer_classname = "WorkerProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.distruntime";
option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
import "google/protobuf/any.proto";
import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/framework/device_attributes.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/tensor.proto";
import "tensorflow/core/framework/tensor_shape.proto";
import "tensorflow/core/framework/types.proto";
import "tensorflow/core/lib/core/error_codes.proto";
import "tensorflow/core/protobuf/config.proto";
import "tensorflow/core/protobuf/debug.proto";
import "tensorflow/core/protobuf/named_tensor.proto";
import "tensorflow/core/protobuf/tensorflow_server.proto";

////////////////////////////////////////////////////////////////////////////////
//
// GetStatus method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message GetStatusRequest {
}

message GetStatusResponse {
  repeated DeviceAttributes device_attributes = 1;
}

////////////////////////////////////////////////////////////////////////////////
//
// CreateWorkerSession method request/response messages
//
// For each master session, the master creates a corresponding worker
// session on every worker that participates in that session.
//
////////////////////////////////////////////////////////////////////////////////

message CreateWorkerSessionRequest {
  // Sessions are identified by a given handle.
  string session_handle = 1;

  // Defines the configuration of a TensorFlow worker.
  ServerDef server_def = 2;

  // If true, any resources such as Variables used in the session will not be
  // shared with other sessions.
  bool isolate_session_state = 3;
}

message CreateWorkerSessionResponse {
}

////////////////////////////////////////////////////////////////////////////////
//
// DeleteWorkerSession method request/response messages
//
// Deletes all worker-side state associated with the given session handle.
//
////////////////////////////////////////////////////////////////////////////////

message DeleteWorkerSessionRequest {
  // Sessions are identified by a given handle.
  string session_handle = 1;
}

message DeleteWorkerSessionResponse {
}
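// A minimal CreateWorkerSessionRequest, shown in proto text format. This is
// an illustrative sketch: the handle below is hypothetical, and a real
// ServerDef would also carry the full ClusterDef for the job.
//
//   session_handle: "session_00000001"
//   server_def {
//     job_name: "worker"
//     task_index: 0
//   }
//   isolate_session_state: true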
////////////////////////////////////////////////////////////////////////////////
//
// RegisterGraph method request/response messages
//
// For each session, after the master has placed every node on a device,
// it partitions the whole graph into many subgraphs. All the nodes in
// a subgraph are assigned to the same worker, but potentially to many
// devices owned by that worker (e.g. cpu0, plus gpu0, gpu1, ..., gpu7).
// The master registers subgraphs for a worker before running any steps.
// A successful registration returns a graph handle to be used in
// subsequent RunGraph requests.
//
////////////////////////////////////////////////////////////////////////////////

message RegisterGraphRequest {
  // Subgraphs are scoped within one session.
  string session_handle = 1;

  // Set to true if `CreateWorkerSession` was called for `session_handle`.
  bool create_worker_session_called = 6;

  // "graph_def" has the subgraph of nodes for this worker, with each node
  // having its device_name filled in.
  GraphDef graph_def = 2;

  // True iff the graph (before partitioning) contains control flow nodes.
  //
  // As of 01/11/2015, this is no longer set by clients.
  bool has_control_flow = 3 [deprecated = true];

  // Configuration options for the session in which this graph was created.
  GraphOptions graph_options = 4;

  // Field(s) used by TensorFlow Debugger (tfdbg).
  DebugOptions debug_options = 5;

  // If graph_def contains any collective ops this must be a positive
  // integer used to coordinate execution with other graphs. All
  // graphs in a distributed execution with the same
  // collective_graph_key will coordinate to use the same step_id
  // concurrently so that BufRendezvous entries will make the correct
  // values accessible.
  int64 collective_graph_key = 7;
}

message RegisterGraphResponse {
  // If the registration succeeds, returns an opaque graph_handle to
  // the master. The master calls RunGraph with graph_handle to
  // compute different steps.
  string graph_handle = 1;
}

////////////////////////////////////////////////////////////////////////////////
//
// DeregisterGraph method request/response messages
//
// The master deregisters the given graph_handle when the graph is no
// longer needed (e.g., the overall graph is re-scheduled and nodes
// are re-placed).
//
// The worker also deregisters a graph_handle automatically, according
// to a TTL-based policy, in case of master restarts.
//
////////////////////////////////////////////////////////////////////////////////

message DeregisterGraphRequest {
  // The session_handle used when registering the graph. If session_handle is
  // empty, a single global namespace is used.
  string session_handle = 2;

  // Set to true if `CreateWorkerSession` was called for `session_handle`.
  bool create_worker_session_called = 3;

  // REQUIRED: graph_handle must be returned by a RegisterGraph call
  // to the same WorkerService.
  string graph_handle = 1;
}

message DeregisterGraphResponse {
  // TODO(mrry): Optionally add summary stats for the graph.
}
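// Illustrative graph lifecycle against one worker, shown as a call sequence
// (the handles below are hypothetical; RunGraph repeats once per step):
//
//   RegisterGraph   {session_handle: "session_00000001", graph_def: ...}
//                     -> {graph_handle: "graph_00000001"}
//   RunGraph        {graph_handle: "graph_00000001", step_id: 42, ...}
//   CleanupGraph    {step_id: 42}
//   DeregisterGraph {graph_handle: "graph_00000001"}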
////////////////////////////////////////////////////////////////////////////////
//
// CleanupAll method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message CleanupAllRequest {
  // A list of container names.
  //
  // If 'container' is not empty, releases resources in the given
  // containers in all devices.
  //
  // If 'container' is empty, releases resources in the default
  // container in all devices.
  repeated string container = 1;
}

message CleanupAllResponse {
}

////////////////////////////////////////////////////////////////////////////////
//
// RunGraph request / response messages
//
// The worker executes all subgraphs registered under graph_handle.
// RunGraph returns after the execution finishes or an error is
// encountered.
// A sequence of RunGraphRequests with is_partial set to true may be sent
// to RunGraph for partial graph execution.
//
////////////////////////////////////////////////////////////////////////////////

// Options specific to the execution of a single step.
message ExecutorOpts {
  bool record_costs = 1;
  bool record_timeline = 3;
  bool record_partition_graphs = 4;
  bool report_tensor_allocations_upon_oom = 5;
}

message RunGraphRequest {
  // session_handle is the master-generated unique id for this session.
  // If session_handle is non-empty, it must be the same handle that was
  // used when registering the graph. If it is empty, a single global
  // namespace is used to search for the graph_handle.
  string session_handle = 8;

  // Set to true if `CreateWorkerSession` was called for `session_handle`.
  bool create_worker_session_called = 10;

  // REQUIRED: graph_handle must be returned by a RegisterGraph call
  // to the same WorkerService.
  string graph_handle = 1;

  // A unique ID to distinguish different runs of the same graph.
  //
  // The master generates a globally unique `step_id` to distinguish
  // different runs of the graph computation. Subgraphs communicate
  // (e.g., send/recv ops) with each other using `step_id` to
  // distinguish tensors generated by different runs.
  int64 step_id = 2;

  // Options for this step.
  ExecutorOpts exec_opts = 5;

  // Runs the graph.
  //
  // Sends the tensors in "send" into the graph before the run and
  // fetches the keys into `RunGraphResponse.recv` after the run.
  repeated NamedTensorProto send = 3;
  repeated string recv_key = 4;

  // True if the RunGraphRequest is a partial run request.
  bool is_partial = 6;
  // True if this is the last partial run request in a sequence of requests.
  bool is_last_partial_run = 7;

  // If true then some errors, e.g., execution errors that have long
  // error messages, may return an OK RunGraphResponse with the actual
  // error saved in the status_code/status_error_message fields of the
  // response body. This is a workaround since the RPC subsystem may
  // truncate long metadata messages.
  bool store_errors_in_response_body = 9;

  // Next: 11
}

message RunGraphResponse {
  // A list of tensors corresponding to those requested by
  // `RunGraphRequest.recv_key`.
  repeated NamedTensorProto recv = 1;

  // If the request asked for execution stats, the cost graph, or the partition
  // graphs, these are returned here.
  // TODO(suharshs): Package these in a RunMetadata instead.
  StepStats step_stats = 2;
  CostGraphDef cost_graph = 3;
  repeated GraphDef partition_graph = 4;

  // If store_errors_in_response_body is true in the request, then
  // optionally the server may return an OK status for the RPC and
  // fill the true status into the fields below, to allow for messages
  // that are too long to fit in metadata.
  error.Code status_code = 5;
  string status_error_message = 6;
}
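// A minimal RunGraphRequest, shown in proto text format. This is an
// illustrative sketch: the handles, step id, and feed/fetch key names below
// are hypothetical.
//
//   session_handle: "session_00000001"
//   graph_handle: "graph_00000001"
//   step_id: 42
//   exec_opts { record_timeline: true }
//   send { name: "feed_x:0" tensor { dtype: DT_FLOAT ... } }
//   recv_key: "fetch_y:0"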
////////////////////////////////////////////////////////////////////////////////
//
// CleanupGraph method request/response messages
//
// After the master receives RunGraph responses from all workers, the
// master instructs every worker to clean up any remaining state of a
// step (e.g. tensors buffered by a `Send` op but not picked up by
// other workers). The master does not necessarily need to wait for
// completion of CleanupGraph calls.
//
// Workers should clean up step states automatically according to a
// TTL-based policy in case of master restarts.
//
////////////////////////////////////////////////////////////////////////////////

message CleanupGraphRequest {
  int64 step_id = 1;
}

message CleanupGraphResponse {
}

////////////////////////////////////////////////////////////////////////////////
//
// RecvTensor method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message RecvTensorRequest {
  // The step in which the tensor will be produced.
  //
  // REQUIRED: This must eventually correspond to the `step_id` passed
  // into a RunGraph call on the same WorkerService.
  int64 step_id = 1;

  // A key identifying the channel to receive tensors from. A RecvTensor
  // request retrieves one tensor from the channel, but multiple tensors can
  // be sent and received over the same channel with multiple RecvTensor
  // requests. See rendezvous.h for details.
  string rendezvous_key = 2;

  // If true, use an out-of-band DMA mechanism to transfer the
  // received tensor.
  bool dma_ok = 3;

  // Optional information on client-side device locality.
  DeviceLocality client_locality = 4;

  // Optional information on server-side device locality.
  DeviceLocality server_locality = 5;

  // Optional information needed by the RPC subsystem.
  google.protobuf.Any transport_options = 6;

  // Unique identifier for this request. Every RecvTensorRequest must have a
  // unique request_id, and retried RecvTensorRequests must have the same
  // request_id. If request_id is zero, retry detection is disabled.
  //
  // Retried RecvTensorRequests are problematic because a RecvTensor with no
  // corresponding sender will wait forever, and the tensor may have been
  // delivered to a previous retry. Workers use request_ids to reject retried
  // RecvTensor requests instead of waiting forever.
  int64 request_id = 7;
}
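// An illustrative rendezvous_key, assuming the ';'-separated layout
// described in rendezvous.h (src_device; src_incarnation; dst_device;
// tensor_name; frame_and_iter). The device names and edge name below are
// hypothetical, and the key is a single string wrapped here for width:
//
//   /job:worker/replica:0/task:0/device:CPU:0;0000000000000001;
//       /job:worker/replica:0/task:1/device:GPU:0;edge_5_MatMul;0:0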
message RecvTensorResponse {
  // The tensor as a proto.
  TensorProto tensor = 1;

  // If true, this tensor was the output of a dead node, and the
  // content is invalid.
  bool is_dead = 2;

  // The time at which the tensor was available and started to be returned.
  int64 send_start_micros = 3;

  // Optional additional information about how to receive the tensor,
  // e.g. in the event that `RecvTensorRequest.dma_ok` was true.
  google.protobuf.Any transport_options = 4;
}

////////////////////////////////////////////////////////////////////////////////
//
// Logging method request/response messages
//
// NOTE(mrry): This feature is not supported in the open-source
// version, and these messages are expected to change.
//
////////////////////////////////////////////////////////////////////////////////

// Out-of-band request to begin or end logging, or
// to retrieve logs for particular steps.
message LoggingRequest {
  // If true, RPC logging will be enabled.
  bool enable_rpc_logging = 1;

  // If true, RPC logging will be disabled.
  bool disable_rpc_logging = 4;

  // If true, discard any saved logging data (for all steps).
  bool clear = 2;

  // When set, requests all saved log data pertaining to the given steps.
  // Any log data retrieved is eliminated from the store and cannot be
  // retrieved again.
  repeated int64 fetch_step_id = 3;
}

message LabeledStepStats {
  int64 step_id = 1;
  StepStats step_stats = 2;
}

message LoggingResponse {
  repeated LabeledStepStats step = 1;
}

////////////////////////////////////////////////////////////////////////////////
//
// Tracing method request/response messages
//
// NOTE(mrry): This feature is not supported in the open-source
// version, and these messages are expected to change.
//
////////////////////////////////////////////////////////////////////////////////

message TraceOpts {
  // Length of the trace to be taken, in seconds.
  double duration = 1;
  // If true, capture the step profile locally on each worker. Currently
  // unimplemented.
  bool use_step_profiler = 2;
  // If true, capture kernel events from each worker.
  bool use_kernel_profiler = 3;
  // If true, capture extended profiling events from the TensorFlow process.
  bool use_extended_profiler = 4;
  // If true, capture GPU profiling events locally on each
  // machine. Currently unimplemented.
  bool use_gpu_profiler = 5;
  // If true, collect sampled profile events. Currently unimplemented.
  bool use_sample_profiler = 6;
}

// Out-of-band request to configure distributed tracing.
message TracingRequest {
  TraceOpts options = 1;
}

message TracingResponse {
}

////////////////////////////////////////////////////////////////////////////////
//
// Raw data transfers in support of Collective Ops.
// These methods are experimental and subject to change.
//
// The intention is to allow collectives to take advantage of the most
// efficient methods available on a platform, e.g. RDMA, and not be
// constrained to use the RPC system in use by other methods.
//
////////////////////////////////////////////////////////////////////////////////

message RecvBufRequest {
  // Use of the fields below may vary by implementation. For example
  // the buf_ptr and num_bytes may be set only for local operations and
  // not sent on the wire, or only sent on the wire in one direction.

  // Used on the server side to find the correct BufRendezvous.
  int64 step_id = 1;

  // Arbitrary string identifying a BufRendezvous entry.
  string buf_rendezvous_key = 2;

  // Size of value expected, must agree with BufRendezvous entry.
  int64 num_bytes = 3;

  // When RDMA is in use, address of destination field on client.
  fixed64 buf_ptr = 4;

  // Optional information on client-side device locality.
  DeviceLocality client_locality = 5;

  // Optional information on server-side device locality.
  DeviceLocality server_locality = 6;

  // Optional, implementation-specific data.
  google.protobuf.Any transport_options = 7;
  // Optional, for annotating the timeline.
  string src_device = 8;
  string dst_device = 9;

  // Depending on the RPC system in use, it may be necessary to set this
  // id to detect resends of RPCs where the server is not aware that
  // the prior RPC failed.
  int64 request_id = 10;
}

message RecvBufResponse {
  // Use of the fields below may vary by implementation. Comments give
  // intended use.

  fixed64 buf_ptr = 1;  // Address of source field on server.
  int64 num_bytes = 2;  // Byte length of buf_ptr field, if set.
  bool is_dead = 3;     // True if value is 'dead' like a tensor.
  // Optional, implementation-specific data.
  google.protobuf.Any transport_options = 4;
  // Optional, for timeline.
  int64 send_start_micros = 5;
}
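// A minimal RecvBufRequest, shown in proto text format. This is an
// illustrative sketch: the key and sizes below are hypothetical, and
// buf_ptr is shown as zero since it is only meaningful for local or RDMA
// transports.
//
//   step_id: 42
//   buf_rendezvous_key: "collective_10_chunk_0"
//   num_bytes: 4096
//   buf_ptr: 0
//   src_device: "/job:worker/replica:0/task:0/device:GPU:0"
//   dst_device: "/job:worker/replica:0/task:1/device:GPU:0"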
////////////////////////////////////////////////////////////////////////////////
//
// Collective Op dynamic group resolution messages.
//
////////////////////////////////////////////////////////////////////////////////

// Supplies one or more device names as members of the group identified by
// group_key. The service will respond when all group_size devices become
// known. All devices in the group must have the same type.
message CompleteGroupRequest {
  int32 group_key = 1;
  int32 group_size = 2;
  string device_type = 3;
  repeated string device_name = 4;
}

// Gives the complete membership of the group identified by group_key.
message CompleteGroupResponse {
  int32 group_key = 1;
  int32 group_size = 2;
  string device_type = 3;
  int32 num_tasks = 4;  // number of distinct tasks hosting the devices
  repeated string device_name = 5;
  repeated string task_name = 6;  // task name prefixes of device_names
}

// Supplies data about one collective op belonging to the instance identified
// by instance_key. The service will respond when all group_size ops have
// become known. Most of the data being sent is for correctness checking,
// to ensure that all ops in the instance share common attributes.
message CompleteInstanceRequest {
  string name = 1;
  int32 type = 2;
  DataType data_type = 3;
  TensorShapeProto shape = 4;
  int32 group_key = 5;
  int32 group_size = 6;
  int32 instance_key = 7;
  string device_type = 8;
  repeated int32 subdiv_offset = 9;
  string device = 10;
  bool is_source = 11;
}

// Confirms that every op in the instance has consistently declared itself.
// Also gives the source_rank in case of broadcast.
message CompleteInstanceResponse {
  int32 instance_key = 1;
  int32 source_rank = 2;
  bytes communicator_key = 3;
}

// Request for the next agreed-upon step_id for the specified graph_keys.
// This is used to enable multiple graphs containing nodes from
// a common collective instance to coordinate using the same step_ids.
message GetStepSequenceRequest {
  repeated int64 graph_key = 1;
}

message StepSequence {
  int64 graph_key = 1;
  int64 next_step_id = 2;
}

// Next valid step_ids for one or more graph_keys.
message GetStepSequenceResponse {
  repeated StepSequence step_sequence = 1;
}
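// Illustrative resolution sequence for a two-device collective (all keys,
// names, and sizes below are hypothetical):
//
//   CompleteGroup    {group_key: 1, group_size: 2, device_type: "GPU",
//                     device_name: "/job:worker/replica:0/task:0/device:GPU:0"}
//                      -> full group membership once both devices are known
//   CompleteInstance {group_key: 1, group_size: 2, instance_key: 10,
//                     data_type: DT_FLOAT, shape {dim {size: 1024}}, ...}
//                      -> per-instance confirmation (and source_rank, for
//                         broadcast)
//   GetStepSequence  {graph_key: 7}
//                      -> {step_sequence {graph_key: 7 next_step_id: 100}}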