• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_PS_CORE_CLUSTER_CONFIG_H_
18 #define MINDSPORE_CCSRC_PS_CORE_CLUSTER_CONFIG_H_
19 
20 #include <string>
21 #include <iostream>
22 #include <memory>
23 #include <utility>
24 #include <unordered_map>
25 
26 #include "utils/log_adapter.h"
27 #include "ps/core/node_info.h"
28 
29 namespace mindspore {
30 namespace ps {
31 namespace core {
// Default timing parameters used to initialise ClusterConfig.
// Heartbeat sending interval, in seconds.
constexpr uint32_t kHearbeatInterval{3};
// Heartbeat timeout, in seconds.
constexpr uint32_t kHearbeatTimeout{30};
// Interval between persistent-storage requests, in seconds.
constexpr uint32_t kPersistentInterval{300};
// Timeout for cluster preparation, in seconds.
constexpr uint32_t kClusterAvailableTimeout{900};
// Timeout for a client connecting to a server, in milliseconds.
constexpr uint32_t kConnectInterval{3000};
// Default for ClusterConfig::scheduler_timeout.
// NOTE(review): the unit is not established in this header — confirm against the code that consumes it.
constexpr int64_t kSchedTimeout{30};
38 /*
39  * Configuration information read through environment variables and configuration files, generally immutable
40  */
41 struct ClusterConfig {
ClusterConfigClusterConfig42   explicit ClusterConfig(const uint32_t &worker_num, const uint32_t &server_num, std::string host, const uint16_t &port)
43       : initial_worker_num(worker_num),
44         initial_server_num(server_num),
45         heartbeat_interval(kHearbeatInterval),
46         persistent_interval(kPersistentInterval),
47         scheduler_host(host),
48         scheduler_port(port),
49         heartbeat_timeout(kHearbeatTimeout),
50         cluster_available_timeout(kClusterAvailableTimeout),
51         connect_interval(kConnectInterval),
52         scheduler_timeout(kSchedTimeout),
53         initial_total_node_num(0),
54         initial_next_worker_rank_id(0),
55         initial_next_server_rank_id(0),
56         initial_cluster_state(ClusterState::CLUSTER_STARTING) {}
57   // Configure through environment variables:MS_WORKER_NUM
58   uint32_t initial_worker_num;
59   // Configure through environment variables:MS_SERVER_NUM
60   uint32_t initial_server_num;
61 
62   // The interval for sending heartbeat packets between worker node,server node and scheduler node is 3 seconds.
63   uint32_t heartbeat_interval;
64   // Persistent storage time interval, sent by the scheduler to each node that needs persistence at equal intervals of
65   // 300 seconds.
66   uint32_t persistent_interval;
67   std::string scheduler_host;
68   uint16_t scheduler_port;
69   // The timeout for worker node and server node sending heartbeat packets to scheduler node is 30 seconds.
70   uint32_t heartbeat_timeout;
71   // Timeout period for cluster preparation is 900 seconds.
72   uint32_t cluster_available_timeout;
73   // The timeout period for the client to connect to the server is 3000ms.
74   uint32_t connect_interval;
75   // When the scheduler exits, the worker and server can continue to work for 5 hours
76   int64_t scheduler_timeout;
77   // the node that has bean registered to scheduler
78   std::unordered_map<std::string, NodeInfo> initial_registered_nodes_infos;
79   uint32_t initial_total_node_num;
80   uint32_t initial_next_worker_rank_id;
81   uint32_t initial_next_server_rank_id;
82   ClusterState initial_cluster_state;
83 };
84 }  // namespace core
85 }  // namespace ps
86 }  // namespace mindspore
87 #endif  // MINDSPORE_CCSRC_PS_CORE_CLUSTER_CONFIG_H_
88