1 /** 2 * Copyright 2021 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_PS_CORE_CLUSTER_CONFIG_H_ 18 #define MINDSPORE_CCSRC_PS_CORE_CLUSTER_CONFIG_H_ 19 20 #include <string> 21 #include <iostream> 22 #include <memory> 23 #include <utility> 24 #include <unordered_map> 25 26 #include "utils/log_adapter.h" 27 #include "ps/core/node_info.h" 28 29 namespace mindspore { 30 namespace ps { 31 namespace core { 32 constexpr uint32_t kHearbeatInterval = 3; 33 constexpr uint32_t kHearbeatTimeout = 30; 34 constexpr uint32_t kPersistentInterval = 300; 35 constexpr uint32_t kClusterAvailableTimeout = 900; 36 constexpr uint32_t kConnectInterval = 3000; 37 constexpr int64_t kSchedTimeout = 30; 38 /* 39 * Configuration information read through environment variables and configuration files, generally immutable 40 */ 41 struct ClusterConfig { ClusterConfigClusterConfig42 explicit ClusterConfig(const uint32_t &worker_num, const uint32_t &server_num, std::string host, const uint16_t &port) 43 : initial_worker_num(worker_num), 44 initial_server_num(server_num), 45 heartbeat_interval(kHearbeatInterval), 46 persistent_interval(kPersistentInterval), 47 scheduler_host(host), 48 scheduler_port(port), 49 heartbeat_timeout(kHearbeatTimeout), 50 cluster_available_timeout(kClusterAvailableTimeout), 51 connect_interval(kConnectInterval), 52 scheduler_timeout(kSchedTimeout), 53 initial_total_node_num(0), 54 initial_next_worker_rank_id(0), 55 initial_next_server_rank_id(0), 56 initial_cluster_state(ClusterState::CLUSTER_STARTING) {} 57 // Configure through environment variables:MS_WORKER_NUM 58 uint32_t initial_worker_num; 59 // Configure through environment variables:MS_SERVER_NUM 60 uint32_t initial_server_num; 61 62 // The interval for sending heartbeat packets between worker node,server node and scheduler node is 3 seconds. 63 uint32_t heartbeat_interval; 64 // Persistent storage time interval, sent by the scheduler to each node that needs persistence at equal intervals of 65 // 300 seconds. 66 uint32_t persistent_interval; 67 std::string scheduler_host; 68 uint16_t scheduler_port; 69 // The timeout for worker node and server node sending heartbeat packets to scheduler node is 30 seconds. 70 uint32_t heartbeat_timeout; 71 // Timeout period for cluster preparation is 900 seconds. 72 uint32_t cluster_available_timeout; 73 // The timeout period for the client to connect to the server is 3000ms. 74 uint32_t connect_interval; 75 // When the scheduler exits, the worker and server can continue to work for 5 hours 76 int64_t scheduler_timeout; 77 // the node that has bean registered to scheduler 78 std::unordered_map<std::string, NodeInfo> initial_registered_nodes_infos; 79 uint32_t initial_total_node_num; 80 uint32_t initial_next_worker_rank_id; 81 uint32_t initial_next_server_rank_id; 82 ClusterState initial_cluster_state; 83 }; 84 } // namespace core 85 } // namespace ps 86 } // namespace mindspore 87 #endif // MINDSPORE_CCSRC_PS_CORE_CLUSTER_CONFIG_H_ 88