1 /** 2 * Copyright 2021 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_DISTRIBUTED_CLUSTER_CLUSTER_CONTEXT_H_ 18 #define MINDSPORE_CCSRC_DISTRIBUTED_CLUSTER_CLUSTER_CONTEXT_H_ 19 20 #include <map> 21 #include <set> 22 #include <mutex> 23 #include <string> 24 #include <memory> 25 #include <atomic> 26 #include <utility> 27 #include "include/backend/distributed/constants.h" 28 #include "utils/log_adapter.h" 29 #include "utils/ms_utils.h" 30 #include "include/backend/distributed/cluster/topology/node_base.h" 31 #include "include/backend/visible.h" 32 33 namespace mindspore { 34 namespace ps::core { 35 struct ClusterConfig; 36 } // namespace ps::core 37 namespace distributed { 38 namespace cluster { 39 // The environment variable name represents the node id of a certain process(compute graph node). 40 constexpr char kNodeId[] = "MS_NODE_ID"; 41 class ActorRouteTableProxy; 42 // Node role based cluster built by MindSpore communication framework. 43 class BACKEND_EXPORT ClusterContext { 44 public: 45 ~ClusterContext(); 46 DISABLE_COPY_AND_ASSIGN(ClusterContext) 47 static std::shared_ptr<ClusterContext> instance(); 48 49 // Initialize the cluster configuration and build network. 50 bool Initialize(); 51 52 // Finalize the cluster and process exits. If timeout is set to UINT32_MAX, this method will block without timeout. 53 bool Finalize(uint32_t timeout = kDefaultFinishTimeout); 54 55 // Return whether this node is the scheduler node. 56 // In a cluster, the scheduler node is special because it's responsible for building network. 57 bool IsScheduler(); 58 59 // Return node object of this process. 60 const std::shared_ptr<topology::NodeBase> &node() const; 61 62 // Return the shadow node. 63 const std::shared_ptr<topology::NodeBase> &node_base() const; 64 65 // Return node role in this cluster. 66 const std::string &node_role() const; 67 68 // Returns total number of the specified node role. This is used as the group size of this node role. 69 uint32_t node_num(const std::string &node_role); 70 71 // Returns the total number of various role nodes. 72 uint32_t node_num() const; 73 74 // Return cluster is initialized. 75 bool initialized() const; 76 77 // Return actor route proxy for AbstractNode. 78 const std::shared_ptr<ActorRouteTableProxy> &actor_route_table_proxy() const; 79 80 // Get and set whether this process exits with exception. 81 void set_cluster_exit_with_exception(); 82 bool cluster_exit_with_exception() const; 83 84 // Return server range of this node. port_range()85 const std::pair<uint32_t, uint32_t> &port_range() const { return port_range_; } 86 87 private: 88 ClusterContext(); 89 90 // This initializing cluster configurations. They can be exported by environment variables, set by python API or 91 // configuration file. 92 void InitClusterConfig(); 93 94 // Build the cluster with other processes. This method will not return until the networking is done. 95 bool BuildCluster(); 96 97 // Load the cluster configuration like worker number, server number and etc. 98 void InitNodeRole(); 99 void InitSchedulerIp(); 100 void InitSchedulerPort(); 101 102 // After cluster is successfully built, some post process should be done. For example, port range assignment and 103 // client ip set, etc. 104 void PostProcess(); 105 106 // The flag that whether this cluster context instance is already initialized. 107 std::atomic_bool inited_; 108 109 // The flag that whether this cluster context instance is already finalized. 110 std::atomic_bool finalized_; 111 112 // The mutex about exiting status of this node. 113 std::mutex finish_mutex_; 114 115 // Whether the process in this cluster exits with any python exception. 116 bool cluster_exit_with_exception_; 117 118 // Node role to role number map. 119 std::map<std::string, uint32_t> node_num_each_role_; 120 121 // Scheduler information. 122 std::string scheduler_host_; 123 uint16_t scheduler_port_; 124 125 // The compute graph node or meta server node according to the configuration of this process. 126 std::shared_ptr<topology::NodeBase> node_base_; 127 128 // Node id of this process in the cluster. 129 std::string node_id_; 130 131 // The role of this process in the cluster. 132 std::string node_role_; 133 134 // The configuration of this cluster. 135 std::unique_ptr<ps::core::ClusterConfig> cluster_config_; 136 137 // The actor route table proxy. It only created in abstract nodes because scheduler does not use proxy. 138 std::shared_ptr<ActorRouteTableProxy> actor_route_table_proxy_; 139 140 std::pair<uint32_t, uint32_t> port_range_; 141 }; 142 } // namespace cluster 143 } // namespace distributed 144 } // namespace mindspore 145 #endif // MINDSPORE_CCSRC_DISTRIBUTED_CLUSTER_CLUSTER_CONTEXT_H_ 146