syntax = "proto3";

package tensorflow;

option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto";

// Coordination service configuration parameters.
// The system picks appropriate values for fields that are not set.
message CoordinationServiceConfig {
  // Type of coordination service implementation to enable.
  // For example, setting the service type as "standalone" starts a service
  // instance on the leader task to provide the coordination services such as
  // heartbeats and consistent key-value store.
  string service_type = 1;

  // Address where the coordination service instance is hosted.
  string service_leader = 2;

  // Whether to enable the health check mechanism.
  bool enable_health_check = 3;

  // Maximum wait time, in milliseconds, for all members in the cluster to be
  // registered.
  int64 cluster_register_timeout_in_ms = 4;

  // Heartbeat timeout, in milliseconds. If a task does not record a heartbeat
  // within this time window, it will be considered disconnected.
  // Note: This is also used as a grace period to accept any heartbeats after
  // the agent has disconnected, to account for the lag time between the
  // service recording the state change and the agent stopping heartbeats.
  int64 heartbeat_timeout_in_ms = 5;

  // The list of jobs that participate in the coordination service. If empty,
  // all jobs will be included in the coordination service by default.
  repeated string coordinated_jobs = 6;

  // Denotes how long to wait, in milliseconds, for all coordination agents to
  // reach the barriers (after the first shutdown request) before
  // disconnecting together. If set to 0, no barrier is imposed upon shutdown
  // and each worker can disconnect individually.
  int64 shutdown_barrier_timeout_in_ms = 7;

  // If set, agents do not make an explicit Shutdown() call. Service will only
  // find out about the disconnected agent via stale heartbeats. Used for
  // testing.
  bool agent_destruction_without_shutdown = 8;

  // The list of jobs which are recoverable. If a task in this list fails,
  // it will not propagate error to other tasks.
  // If empty, no jobs will be recoverable and every task failure will cause
  // error propagation to other tasks.
  repeated string recoverable_jobs = 9;
}