/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #ifndef MINDSPORE_CCSRC_DISTRIBUTED_CLUSTER_CLUSTER_CONTEXT_H_
18 #define MINDSPORE_CCSRC_DISTRIBUTED_CLUSTER_CLUSTER_CONTEXT_H_
19 
#include <atomic>
#include <cstdint>
#include <map>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <utility>
#include "include/backend/distributed/constants.h"
#include "utils/log_adapter.h"
#include "utils/ms_utils.h"
#include "include/backend/distributed/cluster/topology/node_base.h"
#include "include/backend/visible.h"
32 
33 namespace mindspore {
namespace ps::core {
// Forward declaration only. This header refers to ClusterConfig solely through a std::unique_ptr member of
// ClusterContext, so the full definition does not have to be included here.
struct ClusterConfig;
}  // namespace ps::core
37 namespace distributed {
38 namespace cluster {
// Name of the environment variable that represents the node id of a certain process (compute graph node).
// Declared `inline` so that every translation unit including this header shares a single definition.
inline constexpr char kNodeId[] = "MS_NODE_ID";

// Forward declaration; this header only refers to the type through std::shared_ptr.
class ActorRouteTableProxy;
42 // Node role based cluster built by MindSpore communication framework.
43 class BACKEND_EXPORT ClusterContext {
44  public:
45   ~ClusterContext();
46   DISABLE_COPY_AND_ASSIGN(ClusterContext)
47   static std::shared_ptr<ClusterContext> instance();
48 
49   // Initialize the cluster configuration and build network.
50   bool Initialize();
51 
52   // Finalize the cluster and process exits. If timeout is set to UINT32_MAX, this method will block without timeout.
53   bool Finalize(uint32_t timeout = kDefaultFinishTimeout);
54 
55   // Return whether this node is the scheduler node.
56   // In a cluster, the scheduler node is special because it's responsible for building network.
57   bool IsScheduler();
58 
59   // Return node object of this process.
60   const std::shared_ptr<topology::NodeBase> &node() const;
61 
62   // Return the shadow node.
63   const std::shared_ptr<topology::NodeBase> &node_base() const;
64 
65   // Return node role in this cluster.
66   const std::string &node_role() const;
67 
68   // Returns total number of the specified node role. This is used as the group size of this node role.
69   uint32_t node_num(const std::string &node_role);
70 
71   // Returns the total number of various role nodes.
72   uint32_t node_num() const;
73 
74   // Return cluster is initialized.
75   bool initialized() const;
76 
77   // Return actor route proxy for AbstractNode.
78   const std::shared_ptr<ActorRouteTableProxy> &actor_route_table_proxy() const;
79 
80   // Get and set whether this process exits with exception.
81   void set_cluster_exit_with_exception();
82   bool cluster_exit_with_exception() const;
83 
84   // Return server range of this node.
port_range()85   const std::pair<uint32_t, uint32_t> &port_range() const { return port_range_; }
86 
87  private:
88   ClusterContext();
89 
90   // This initializing cluster configurations. They can be exported by environment variables, set by python API or
91   // configuration file.
92   void InitClusterConfig();
93 
94   // Build the cluster with other processes. This method will not return until the networking is done.
95   bool BuildCluster();
96 
97   // Load the cluster configuration like worker number, server number and etc.
98   void InitNodeRole();
99   void InitSchedulerIp();
100   void InitSchedulerPort();
101 
102   // After cluster is successfully built, some post process should be done. For example, port range assignment and
103   // client ip set, etc.
104   void PostProcess();
105 
106   // The flag that whether this cluster context instance is already initialized.
107   std::atomic_bool inited_;
108 
109   // The flag that whether this cluster context instance is already finalized.
110   std::atomic_bool finalized_;
111 
112   // The mutex about exiting status of this node.
113   std::mutex finish_mutex_;
114 
115   // Whether the process in this cluster exits with any python exception.
116   bool cluster_exit_with_exception_;
117 
118   // Node role to role number map.
119   std::map<std::string, uint32_t> node_num_each_role_;
120 
121   // Scheduler information.
122   std::string scheduler_host_;
123   uint16_t scheduler_port_;
124 
125   // The compute graph node or meta server node according to the configuration of this process.
126   std::shared_ptr<topology::NodeBase> node_base_;
127 
128   // Node id of this process in the cluster.
129   std::string node_id_;
130 
131   // The role of this process in the cluster.
132   std::string node_role_;
133 
134   // The configuration of this cluster.
135   std::unique_ptr<ps::core::ClusterConfig> cluster_config_;
136 
137   // The actor route table proxy. It only created in abstract nodes because scheduler does not use proxy.
138   std::shared_ptr<ActorRouteTableProxy> actor_route_table_proxy_;
139 
140   std::pair<uint32_t, uint32_t> port_range_;
141 };
142 }  // namespace cluster
143 }  // namespace distributed
144 }  // namespace mindspore
145 #endif  // MINDSPORE_CCSRC_DISTRIBUTED_CLUSTER_CLUSTER_CONTEXT_H_
146