syntax = "proto3";

package tensorflow;

option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto";

// Coordination service configuration parameters.
// The system picks appropriate values for fields that are not set.
//
// NOTE: the scalar fields below are declared without `optional`, so in proto3
// a field explicitly set to its zero value ("", false, 0) cannot be
// distinguished from a field that was never set.
message CoordinationServiceConfig {
  // Type of coordination service implementation to enable.
  // For example, setting the service type as "standalone" starts a service
  // instance on the leader task to provide the coordination services such as
  // heartbeats and consistent key-value store.
  string service_type = 1;

  // Address where the coordination service instance is hosted.
  string service_leader = 2;

  // Whether to enable the health check mechanism.
  bool enable_health_check = 3;

  // Maximum wait time, in milliseconds, for all members in the cluster to be
  // registered.
  int64 cluster_register_timeout_in_ms = 4;

  // Heartbeat timeout in milliseconds. If a task does not record a heartbeat
  // within this time window, it is considered disconnected.
  // Note: This is also used as a grace period to accept any heartbeats after
  // the agent has disconnected, to account for the lag time between the service
  // recording the state change and the agent stopping heartbeats.
  int64 heartbeat_timeout_in_ms = 5;

  // The list of jobs that participate in the coordination service. If empty,
  // all jobs will be included in the coordination service by default.
  repeated string coordinated_jobs = 6;

  // Denotes how long to wait, in milliseconds, for all coordination agents to
  // reach the barriers (after the first shutdown request) before disconnecting
  // together. If set to 0, no barrier is imposed upon shutdown and each worker
  // can disconnect individually.
  int64 shutdown_barrier_timeout_in_ms = 7;

  // If set, agents do not make an explicit Shutdown() call. The service will
  // only find out about the disconnected agent via stale heartbeats. Used for
  // testing.
  bool agent_destruction_without_shutdown = 8;

  // The list of jobs which are recoverable. If a task in this list fails,
  // it will not propagate the error to other tasks.
  // If empty, no jobs will be recoverable and every task failure will cause
  // error propagation to other tasks.
  repeated string recoverable_jobs = 9;
}