/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_DISTRIBUTED_CLUSTER_TOPOLOGY_COMMON_H_
#define MINDSPORE_CCSRC_DISTRIBUTED_CLUSTER_TOPOLOGY_COMMON_H_

#include <string>
#include <chrono>
#include <cstddef>
#include <cstdint>

namespace mindspore {
namespace distributed {
namespace cluster {
namespace topology {
// Indicates the state of the cluster physical topology.
enum class TopoState {
  // All the nodes of this cluster are in the process of starting up.
  kInitializing = 0,

  // All the nodes of this cluster have been started and registered to the meta server node successfully.
  kInitialized,

  // The topo of this cluster failed to construct at specified time.
  kFailed,

  // All the nodes of this cluster have finished their tasks and unregistered successfully.
  kFinished
};

// The address of meta server node used by compute graph nodes to register and get addresses of other compute graph
// nodes dynamically.
struct MetaServerAddress {
  // Returns the address formatted as "<ip>:<port>". Does not modify the object, so it is callable on const instances.
  std::string GetUrl() const { return ip + ":" + std::to_string(port); }
  std::string ip;
  // Defaults to -1 until a port is assigned.
  int port{-1};
};

// The address of meta server node.
// This address is set or obtained through environment variables.
constexpr char kEnvMetaServerHost[] = "MS_SCHED_HOST";
constexpr char kEnvMetaServerPort[] = "MS_SCHED_PORT";

constexpr char kEnvNodeId[] = "MS_NODE_ID";

// The key of compute graph node's hostname metadata stored in meta server.
constexpr char kHostNames[] = "hostnames";

// For port number conversion.
constexpr int kDecimal = 10;

// All kinds of messages sent between compute graph nodes and meta server node.
enum class MessageName {
  kRegistration,
  kUnregistration,
  kHeartbeat,
  kSuccess,
  kInvalidNode,
  kUninitTopo,
  kWriteMetadata,
  kReadMetadata,
  kDeleteMetadata,
  kGetHostNames,
  kValidMetadata,
  kInvalidMetadata
};

// The retry and interval configuration used for the macro `EXECUTE_WITH_RETRY`.
constexpr size_t kExecuteRetryNum = 210;
// The retry number of cgn and msn for reconnecting.
constexpr size_t kCgnExecuteRetryNum = 210;
constexpr size_t kMsnExecuteRetryNum = 210;
constexpr size_t kNoRetry = 1;
constexpr uint32_t kExecuteInterval = 3;

// Cluster building time out window in second. Default: 30 minutes.
constexpr char kEnvTopoTimeOut[] = "MS_TOPO_TIMEOUT";
constexpr size_t kDefaultTopoTimeOut = 30 * 60;

// The timeout(second) window for heartbeat from compute graph node to meta server. Default: 300 seconds.
constexpr char kEnvNodeTimeOut[] = "MS_NODE_TIMEOUT";
constexpr size_t kDefaultNodeTimeout = 300;

constexpr char kEnvRetryIntervalLower[] = "MS_RETRY_INTERVAL_LOWER";
constexpr size_t kDefaultRetryInterLower = 3;

constexpr char kEnvRetryIntervalUpper[] = "MS_RETRY_INTERVAL_UPPER";
constexpr size_t kDefaultRetryInterUpper = 5;

// NOTE: the three macros below expand to code that uses `MS_LOG` and `sleep`, neither of which is declared by this
// header; both must be visible at the expansion site. Every macro may execute `return false;`, so it can only be
// used inside a function whose return type accepts `false`.

// Calls `func()` up to `retry` times. After each failed call a warning is logged and `sleep(interval)` is invoked
// (including after the final failed attempt). If no call succeeds, the enclosing function returns false.
#define EXECUTE_WITH_RETRY(func, retry, interval, err_msg)                     \
  do {                                                                         \
    bool success = false;                                                      \
    for (size_t i = 1; i <= retry; ++i) {                                      \
      success = func();                                                        \
      if (!success) {                                                          \
        MS_LOG(WARNING) << err_msg << ", retry(" << i << "/" << retry << ")."; \
        (void)sleep(interval);                                                 \
      } else {                                                                 \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    if (!success) {                                                            \
      return false;                                                            \
    }                                                                          \
  } while (false)

// Repeatedly evaluates the expression `func` and stores its result in the caller-provided lvalue `success` until it
// is true or the budget `time` is used up. `time` must be a modifiable lvalue; it is decreased by `interval` after
// each failed attempt. The remaining budget is compared with `(time) < (interval)` instead of testing
// `(time - interval) < 0`, because the latter can never be true for unsigned budgets (the subtraction wraps around).
// When the budget runs out without success, an error is logged and the enclosing function returns false, even if a
// remainder smaller than `interval` is left in `time`.
#define EXECUTE_WITH_TIMEOUT(func, interval, err_msg, success, time) \
  do {                                                               \
    success = false;                                                 \
    while (!success) {                                               \
      success = func;                                                \
      if (!success) {                                                \
        MS_LOG(WARNING) << err_msg << ", retry...";                  \
        (void)sleep(interval);                                       \
        if ((time) < (interval)) break;                              \
        (time) -= (interval);                                        \
      } else {                                                       \
        break;                                                       \
      }                                                              \
    }                                                                \
    if (!success) {                                                  \
      MS_LOG(ERROR) << err_msg;                                      \
      return false;                                                  \
    }                                                                \
  } while (false)

// Same loop as `EXECUTE_WITH_TIMEOUT`, but an attempt counts as successful when `func == expected`, and the success
// flag is local to the macro. Because the flag is not visible to the caller, running out of budget always makes the
// enclosing function return false; otherwise the timeout would go unnoticed.
#define EXECUTE_WITH_EXPECTED(func, expected, interval, err_msg, time) \
  do {                                                                 \
    bool success = false;                                              \
    while (!success) {                                                 \
      success = (func == expected);                                    \
      if (!success) {                                                  \
        MS_LOG(WARNING) << err_msg << ", retry...";                    \
        (void)sleep(interval);                                         \
        if ((time) < (interval)) break;                                \
        (time) -= (interval);                                          \
      } else {                                                         \
        break;                                                         \
      }                                                                \
    }                                                                  \
    if (!success) {                                                    \
      MS_LOG(ERROR) << err_msg;                                        \
      return false;                                                    \
    }                                                                  \
  } while (false)
}  // namespace topology
}  // namespace cluster
}  // namespace distributed
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_DISTRIBUTED_CLUSTER_TOPOLOGY_COMMON_H_