• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2022 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_DISTRIBUTED_CLUSTER_TOPOLOGY_COMMON_H_
18 #define MINDSPORE_CCSRC_DISTRIBUTED_CLUSTER_TOPOLOGY_COMMON_H_
19 
20 #include <string>
21 #include <chrono>
22 
23 namespace mindspore {
24 namespace distributed {
25 namespace cluster {
26 namespace topology {
// Lifecycle state of the cluster's physical topology.
enum class TopoState {
  // The nodes of the cluster are still in the process of starting up.
  kInitializing = 0,

  // Every node of the cluster has started and has registered to the meta server node successfully.
  kInitialized = 1,

  // The topology of the cluster could not be constructed within the specified time.
  kFailed = 2,

  // Every node of the cluster has finished its tasks and has unregistered successfully.
  kFinished = 3
};
41 
// The address of the meta server node, used by compute graph nodes to register and to get the addresses of other
// compute graph nodes dynamically.
struct MetaServerAddress {
  // Returns the address formatted as "<ip>:<port>". The fields are not validated, so a default-constructed
  // address yields ":-1". Declared const so it can be called on const objects and through const references.
  std::string GetUrl() const { return ip + ":" + std::to_string(port); }

  // Host part of the address; empty until assigned.
  std::string ip;
  // Port part of the address; stays -1 until assigned.
  int port{-1};
};
49 
// Names of the environment variables through which the meta server node's address is set or obtained.
constexpr char kEnvMetaServerHost[]{"MS_SCHED_HOST"};
constexpr char kEnvMetaServerPort[]{"MS_SCHED_PORT"};

constexpr char kEnvNodeId[]{"MS_NODE_ID"};

// Key under which the hostnames of the compute graph nodes are stored as metadata in the meta server.
constexpr char kHostNames[]{"hostnames"};

// Numeric base used when converting port numbers.
constexpr int kDecimal{10};
62 
// Every kind of message exchanged between the compute graph nodes and the meta server node.
// The enumerator values are consecutive and start at zero.
enum class MessageName {
  kRegistration = 0,
  kUnregistration = 1,
  kHeartbeat = 2,
  kSuccess = 3,
  kInvalidNode = 4,
  kUninitTopo = 5,
  kWriteMetadata = 6,
  kReadMetadata = 7,
  kDeleteMetadata = 8,
  kGetHostNames = 9,
  kValidMetadata = 10,
  kInvalidMetadata = 11
};
78 
// Retry count and interval configuration used with the macro `EXECUTE_WITH_RETRY`.
constexpr size_t kExecuteRetryNum{210};
// Retry counts used by cgn and msn when reconnecting.
constexpr size_t kCgnExecuteRetryNum{210};
constexpr size_t kMsnExecuteRetryNum{210};
constexpr size_t kNoRetry{1};
constexpr uint32_t kExecuteInterval{3};

// Time-out window, in seconds, for building the cluster. Default: 30 minutes.
constexpr char kEnvTopoTimeOut[]{"MS_TOPO_TIMEOUT"};
constexpr size_t kDefaultTopoTimeOut{30 * 60};

// Time-out window, in seconds, for the heartbeat from a compute graph node to the meta server.
// Default: 300 seconds.
constexpr char kEnvNodeTimeOut[]{"MS_NODE_TIMEOUT"};
constexpr size_t kDefaultNodeTimeout{300};

constexpr char kEnvRetryIntervalLower[]{"MS_RETRY_INTERVAL_LOWER"};
constexpr size_t kDefaultRetryInterLower{3};

constexpr char kEnvRetryIntervalUpper[]{"MS_RETRY_INTERVAL_UPPER"};
constexpr size_t kDefaultRetryInterUpper{5};
100 
// Calls `func()` up to `retry` times, stopping at the first call whose result converts to true.
// After each failed attempt a warning built from `err_msg` is logged; `interval` seconds are slept between
// attempts, but not after the last one, because no attempt follows it and sleeping would only delay the failure.
// When every attempt fails the ENCLOSING function returns false, so the macro may only be used inside functions
// whose return type accepts `false`.
// `retry` is expanded several times and `interval` once per sleep: pass side-effect-free expressions.
// `err_msg` is spliced directly into a `<<` chain and is therefore deliberately left unparenthesized.
#define EXECUTE_WITH_RETRY(func, retry, interval, err_msg)                     \
  do {                                                                         \
    bool success = false;                                                      \
    for (size_t i = 1; i <= (retry); ++i) {                                    \
      success = func();                                                        \
      if (success) {                                                           \
        break;                                                                 \
      }                                                                        \
      MS_LOG(WARNING) << err_msg << ", retry(" << i << "/" << (retry) << ")."; \
      if (i < (retry)) {                                                       \
        (void)sleep(interval);                                                 \
      }                                                                        \
    }                                                                          \
    if (!success) {                                                            \
      return false;                                                            \
    }                                                                          \
  } while (false)
117 
// Repeatedly evaluates the expression `func` until its result converts to true or the time budget runs out.
//   func     - expression re-evaluated on every attempt (typically a call).
//   interval - seconds slept after each failed attempt, and the amount deducted from `time`; should be positive,
//              otherwise the budget never shrinks.
//   err_msg  - spliced directly into a `<<` chain for the warning and error logs (left unparenthesized on purpose).
//   success  - caller-provided lvalue; reset to false on entry and set to the outcome of the last attempt.
//   time     - caller-provided lvalue holding the remaining budget in seconds; it is MODIFIED in place.
// On time-out an error is logged and the ENCLOSING function returns false.
// The remaining budget is compared as `long long`, so an unsigned `time` cannot wrap around, and any failed wait
// returns false even when the budget is not a multiple of `interval`.
#define EXECUTE_WITH_TIMEOUT(func, interval, err_msg, success, time)           \
  do {                                                                         \
    success = false;                                                           \
    while (!success) {                                                         \
      success = (func);                                                        \
      if (success) {                                                           \
        break;                                                                 \
      }                                                                        \
      MS_LOG(WARNING) << err_msg << ", retry...";                              \
      (void)sleep(interval);                                                   \
      if (static_cast<long long>(time) < static_cast<long long>(interval)) {   \
        break;                                                                 \
      }                                                                        \
      time -= (interval);                                                      \
    }                                                                          \
    if (!success) {                                                            \
      MS_LOG(ERROR) << err_msg;                                                \
      return false;                                                            \
    }                                                                          \
  } while (false)
137 
// Repeatedly evaluates the expression `func` until it compares equal to `expected` or the time budget runs out.
//   func     - expression re-evaluated on every attempt (typically a call).
//   expected - value the result of `func` is compared against with `==`; re-evaluated on every attempt.
//   interval - seconds slept after each failed attempt, and the amount deducted from `time`; should be positive,
//              otherwise the budget never shrinks.
//   err_msg  - spliced directly into a `<<` chain for the warning and error logs (left unparenthesized on purpose).
//   time     - caller-provided lvalue holding the remaining budget in seconds; it is MODIFIED in place.
// On time-out an error is logged and the ENCLOSING function returns false.
// The remaining budget is compared as `long long`, so an unsigned `time` cannot wrap around, and any failed wait
// returns false even when the budget is not a multiple of `interval`.
#define EXECUTE_WITH_EXPECTED(func, expected, interval, err_msg, time)         \
  do {                                                                         \
    bool success = false;                                                      \
    while (!success) {                                                         \
      success = ((func) == (expected));                                        \
      if (success) {                                                           \
        break;                                                                 \
      }                                                                        \
      MS_LOG(WARNING) << err_msg << ", retry...";                              \
      (void)sleep(interval);                                                   \
      if (static_cast<long long>(time) < static_cast<long long>(interval)) {   \
        break;                                                                 \
      }                                                                        \
      time -= (interval);                                                      \
    }                                                                          \
    if (!success) {                                                            \
      MS_LOG(ERROR) << err_msg;                                                \
      return false;                                                            \
    }                                                                          \
  } while (false)
157 }  // namespace topology
158 }  // namespace cluster
159 }  // namespace distributed
160 }  // namespace mindspore
161 #endif  // MINDSPORE_CCSRC_DISTRIBUTED_CLUSTER_TOPOLOGY_COMMON_H_
162