1 /** 2 * Copyright 2021 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_PS_CONSTANTS_H_ 18 #define MINDSPORE_CCSRC_PS_CONSTANTS_H_ 19 20 #include <limits.h> 21 22 #include <iostream> 23 #include <vector> 24 #include <memory> 25 #include <map> 26 #include <string> 27 #include <functional> 28 #include "utils/shape_utils.h" 29 30 namespace mindspore { 31 namespace distributed::persistent { 32 template <typename T> 33 class Data; 34 template <typename T> 35 class PersistentData; 36 } // namespace distributed::persistent 37 namespace ps { 38 constexpr char kEnvCommType[] = "MS_COMM_TYPE"; 39 constexpr char kEnvInterface[] = "MS_INTERFACE"; 40 constexpr char kEnvPServerNum[] = "MS_SERVER_NUM"; 41 constexpr char kEnvWorkerNum[] = "MS_WORKER_NUM"; 42 constexpr char kEnvSchedulerHost[] = "MS_SCHED_HOST"; 43 constexpr char kEnvSchedulerPort[] = "MS_SCHED_PORT"; 44 constexpr char kEnvSchedulerManagePort[] = "MS_SCHED_MANAGE_PORT"; 45 constexpr char kEnvNodeId[] = "MS_NODE_ID"; 46 47 constexpr char kCommTypeOfIBVerbs[] = "ibverbs"; 48 constexpr char kRoleOfPServer[] = "server"; 49 constexpr char kRoleOfWorker[] = "worker"; 50 constexpr char kRoleOfScheduler[] = "scheduler"; 51 52 constexpr char kLearningRate[] = "learning_rate"; 53 constexpr char kMomentum[] = "momentum"; 54 55 constexpr char kApplyMomentum[] = "ApplyMomentum"; 56 constexpr char kSparseAdam[] = "Adam"; 57 constexpr char 
kSparseLazyAdam[] = "LazyAdam"; 58 constexpr char kSparseFtrl[] = "Ftrl"; 59 constexpr char kApplyMomentumOp[] = "Momentum"; 60 constexpr char kSparseAdamOp[] = "Adam"; 61 constexpr char kSparseLazyAdamOp[] = "LazyAdam"; 62 constexpr char kSparseFtrlOp[] = "FTRL"; 63 64 constexpr char kCertificateChain[] = "server.crt"; 65 constexpr char kPrivateKey[] = "server.key"; 66 constexpr char kCAcrt[] = "ca.crt"; 67 68 constexpr char kKeys[] = "keys"; 69 constexpr char kShapes[] = "shapes"; 70 constexpr char kParamNames[] = "param_names"; 71 constexpr char kRecoverFunc[] = "recover_function"; 72 constexpr char kRecoverEmbedding[] = "RecoverEmbedding"; 73 constexpr char kCurrentDirOfServer[] = "./server_"; 74 constexpr char kParamWithKey[] = "_parameter_key_"; 75 76 constexpr int64_t kInitWeightsCmd = 10; 77 constexpr int64_t kInitWeightToOptimIdCmd = 11; 78 constexpr int64_t kInitOptimInputsShapeCmd = 12; 79 constexpr int64_t kInitKeyToPushNodeIdCmd = 13; 80 constexpr int64_t kInitEmbeddingsCmd = 20; 81 constexpr int64_t kUpdateEmbeddingsCmd = 21; 82 constexpr int64_t kCheckReadyForPushCmd = 25; 83 constexpr int64_t kCheckReadyForPullCmd = 26; 84 constexpr int64_t kEmbeddingLookupCmd = 30; 85 constexpr int64_t kFinalizeCmd = 40; 86 constexpr int64_t kPushCmd = 50; 87 constexpr int64_t kPullCmd = 51; 88 89 constexpr size_t kInvalidKey = UINT64_MAX; 90 constexpr int64_t kInvalidID = -1; 91 92 constexpr uint32_t kMaxMessageSize = static_cast<uint32_t>(100 * (uint32_t(1) << 20)); 93 constexpr char kServerNum[] = "server_num"; 94 constexpr char kWorkerNum[] = "worker_num"; 95 constexpr char kNodesIds[] = "node_ids"; 96 constexpr char kNodeId[] = "node_id"; 97 98 constexpr char kSuccessCode[] = "0"; 99 constexpr char kErrorCode[] = "1"; 100 101 constexpr int64_t kSubmitTaskIntervalInMs = 1; 102 constexpr int64_t kMaxTaskNum = 10240; 103 constexpr int64_t kSubmitTimeOutInMs = 30000; 104 constexpr int64_t kRetryCount = 60; 105 constexpr int64_t kRetryIntervalInMs = 10; 106 107 
constexpr int64_t kThreadNum = 32;
constexpr int64_t kGradIndex = 0;
constexpr int64_t kIndiceIndex = 1;
constexpr int64_t kFirstDimSize = 2;
constexpr int64_t kOutDimSize = 3;

constexpr int64_t kBase = 10;
// Float literal spelled with the 'f' suffix so no double-to-float conversion is involved.
constexpr float kStdDev = 0.01f;
// The timeout period for the scale in node to send the finish message to scheduler.
constexpr uint32_t kScaleInTimeoutInSenconds = 30;
// The number of retries to determine whether all nodes are successfully registered.
constexpr uint32_t kCheckRegisteredRetryCount = 30;
// The timeout interval for judging whether all nodes are successfully registered.
constexpr uint32_t kCheckRegisteredIntervalInMs = 1000;

constexpr int64_t kSparseLazyAdamIndex = 2;
constexpr int64_t kSparseFtrlIndex = 3;
constexpr int64_t kSparseGradIndex = 6;
constexpr int64_t kSparseIndiceIndex = 7;

constexpr int64_t kHeartbeatTimes = 2;
constexpr int64_t kGradValue = -100;
// Whether to support recovery.
constexpr char kIsRecovery[] = "is_recovery";
// The type of persistent storage, currently only supports file storage.
constexpr char kStoreType[] = "storage_type";
// The file used to store metadata.
constexpr char kStoreFilePath[] = "storage_file_path";
// The file used to store scheduler metadata.
constexpr char kSchedulerStoreFilePath[] = "scheduler_storage_file_path";
// 1 indicates that the persistent storage type is file.
constexpr char kFileStorage[] = "1";
// The recovery key of json_config.
140 constexpr char kKeyRecovery[] = "recovery"; 141 constexpr char kRecoveryWorkerNum[] = "worker_num"; 142 constexpr char kRecoveryServerNum[] = "server_num"; 143 constexpr char kRecoverySchedulerIp[] = "scheduler_ip"; 144 constexpr char kRecoverySchedulerPort[] = "scheduler_port"; 145 constexpr char kRecoveryTotalNodeNum[] = "total_node_num"; 146 constexpr char kRecoveryNextWorkerRankId[] = "next_worker_rank_id"; 147 constexpr char kRecoveryNextServerRankId[] = "next_server_rank_id"; 148 constexpr char kRecoveryRegisteredNodesInfos[] = "node_ids"; 149 constexpr char kRecoveryClusterState[] = "cluster_state"; 150 151 constexpr char kServerCertPath[] = "server_cert_path"; 152 constexpr char kServerPassword[] = "server_password"; 153 constexpr char kCrlPath[] = "crl_path"; 154 constexpr char kClientCertPath[] = "client_cert_path"; 155 constexpr char kClientPassword[] = "client_password"; 156 constexpr char kCaCertPath[] = "ca_cert_path"; 157 constexpr char kCipherList[] = "cipher_list"; 158 constexpr char kCertCheckInterval[] = "cert_check_interval_in_hour"; 159 // 7 * 24 160 constexpr int64_t kCertCheckIntervalInHour = 168; 161 constexpr char kCertExpireWarningTime[] = "cert_expire_warning_time_in_day"; 162 // 90 163 constexpr int64_t kCertExpireWarningTimeInDay = 90; 164 constexpr char kConnectionNum[] = "connection_num"; 165 constexpr int64_t kConnectionNumDefault = 10000; 166 constexpr char kLocalIp[] = "127.0.0.1"; 167 168 constexpr int64_t kJanuary = 1; 169 constexpr int64_t kSeventyYear = 70; 170 constexpr int64_t kHundredYear = 100; 171 constexpr int64_t kThousandYear = 1000; 172 constexpr int64_t kBaseYear = 1900; 173 constexpr int64_t kMinWarningTime = 7; 174 constexpr int64_t kMaxWarningTime = 180; 175 176 constexpr int64_t kLength = 100; 177 constexpr int64_t kMaxPort = 65535; 178 constexpr int64_t kSecurityLevel = 3; 179 180 constexpr char kTcpCommunicator[] = "TCP"; 181 constexpr char kHttpCommunicator[] = "HTTP"; 182 183 constexpr char kServerCert[] = 
"server.p12"; 184 constexpr char kClientCert[] = "client.p12"; 185 constexpr char kCaCert[] = "ca.crt"; 186 constexpr char kColon = ':'; 187 const std::map<std::string, size_t> kCiphers = { 188 {"ECDHE-RSA-AES128-GCM-SHA256", 0}, {"ECDHE-ECDSA-AES128-GCM-SHA256", 1}, {"ECDHE-RSA-AES256-GCM-SHA384", 2}, 189 {"ECDHE-ECDSA-AES256-GCM-SHA384", 3}, {"ECDHE-RSA-CHACHA20-POLY1305", 4}, {"ECDHE-PSK-CHACHA20-POLY1305", 5}, 190 {"ECDHE-ECDSA-AES128-CCM", 6}, {"ECDHE-ECDSA-AES256-CCM", 7}, {"ECDHE-ECDSA-CHACHA20-POLY1305", 8}}; 191 192 using DataPtr = std::unique_ptr<uint8_t[]>; 193 using VectorPtr = std::shared_ptr<std::vector<unsigned char>>; 194 using Key = size_t; 195 using Keys = std::vector<Key>; 196 using Values = std::vector<float>; 197 using ValuesPtr = std::shared_ptr<Values>; 198 using Weight = distributed::persistent::Data<float>; 199 using PersistentWeight = distributed::persistent::PersistentData<float>; 200 using Grad = std::vector<float>; 201 using LookupIds = std::vector<Key>; 202 using Lengths = std::vector<int>; 203 using WeightPtr = std::shared_ptr<Weight>; 204 using PersistentWeightPtr = std::shared_ptr<PersistentWeight>; 205 using GradPtr = std::shared_ptr<Grad>; 206 using InputsShape = std::vector<std::shared_ptr<ShapeVector>>; 207 using InputsShapePtr = std::shared_ptr<std::vector<std::shared_ptr<ShapeVector>>>; 208 209 constexpr size_t INDEX_NOT_SEND = UINT_MAX; 210 using OptimOriginIdx = std::map<std::string, size_t>; 211 using OptimPSSendIdx = std::map<std::string, size_t>; 212 213 using EventCallback = std::function<void(void)>; 214 215 const OptimOriginIdx kMomentumOriginIdx = {{"weight", 0}, {"accum", 1}, {"lr", 2}, {"grad", 3}, {"momentum", 4}}; 216 const OptimPSSendIdx kMomentumPSSendIdx = { 217 {"weight", INDEX_NOT_SEND}, {"accum", INDEX_NOT_SEND}, {"lr", 0}, {"grad", 1}, {"momentum", 2}}; 218 219 const OptimOriginIdx kSparseAdamOriginIdx = {{"weight", 0}, {"m", 1}, {"v", 2}, {"beta1_power", 3}, 220 {"beta2_power", 4}, {"lr", 5}, {"beta1", 6}, 
{"beta2", 7}, 221 {"eps", 8}, {"grad", 9}, {"indices", 10}}; 222 const OptimPSSendIdx kSparseAdamPSSendIdx = {{"weight", INDEX_NOT_SEND}, 223 {"m", INDEX_NOT_SEND}, 224 {"v", INDEX_NOT_SEND}, 225 {"beta1_power", 0}, 226 {"beta2_power", 1}, 227 {"lr", 2}, 228 {"beta1", 3}, 229 {"beta2", 4}, 230 {"eps", 5}, 231 {"grad", 6}, 232 {"indices", 7}}; 233 234 const OptimOriginIdx kSparseFtrlOriginIdx = {{"weight", 0}, {"accum", 1}, {"linear", 2}, {"grad", 3}, {"indices", 4}}; 235 const OptimPSSendIdx kSparseFtrlPSSendIdx = { 236 {"weight", INDEX_NOT_SEND}, {"accum", INDEX_NOT_SEND}, {"linear", INDEX_NOT_SEND}, {"grad", 0}, {"indices", 1}}; 237 238 const std::map<std::string, OptimOriginIdx> kOptimToOriginIdx = {{kApplyMomentum, kMomentumOriginIdx}, 239 {kSparseAdam, kSparseAdamOriginIdx}, 240 {kSparseLazyAdam, kSparseAdamOriginIdx}, 241 {kSparseFtrl, kSparseFtrlOriginIdx}}; 242 const std::map<std::string, OptimOriginIdx> kOptimToPSSendIdx = {{kApplyMomentum, kMomentumPSSendIdx}, 243 {kSparseAdam, kSparseAdamPSSendIdx}, 244 {kSparseLazyAdam, kSparseAdamPSSendIdx}, 245 {kSparseFtrl, kSparseFtrlPSSendIdx}}; 246 247 // The barrier function which should be called before doing scaling out/in operations. 248 // It's easy for us to scale out/in nodes after one iteration is completed and keep consistent. 249 using BarrierBeforeScaleOut = std::function<void(void)>; 250 using BarrierBeforeScaleIn = std::function<void(void)>; 251 252 // These handlers helps worker/server node to reinitialize or recover data after scaling out/in operation of scheduler 253 // is done. 
254 using HandlerAfterScaleOut = std::function<void(void)>; 255 using HandlerAfterScaleIn = std::function<void(void)>; 256 using HandlerAfterScaleOutRollback = std::function<void(void)>; 257 258 constexpr char kClusterNotReady[] = 259 "The Scheduler's connections are not equal with total node num, Maybe this is because some server nodes are drop " 260 "out or scale in nodes has not been recycled."; 261 constexpr char kJobNotReady[] = "The server's training job is not ready."; 262 constexpr char kClusterSafeMode[] = "The cluster is in safemode."; 263 constexpr char kJobNotAvailable[] = "The server's training job is disabled or finished."; 264 265 enum class UserDefineEvent { kIterationRunning = 0, kIterationCompleted, kNodeTimeout }; 266 267 #define EXC_IF_VEC_IDX_OOB(vec, idx) \ 268 do { \ 269 size_t vec_size = vec.size(); \ 270 if (idx >= vec_size) { \ 271 MS_LOG(EXCEPTION) << "Vector " << #vec << " size is " << vec_size << ". So index " << idx \ 272 << " is out of bound."; \ 273 } \ 274 } while (0) 275 276 #define ERROR_STATUS(result, code, message) \ 277 do { \ 278 MS_LOG(ERROR) << message; \ 279 result = RequestProcessResult(code, message); \ 280 } while (0) 281 282 #define CHECK_RETURN_TYPE(_condition) \ 283 do { \ 284 if (!(_condition)) { \ 285 MS_LOG(ERROR) << "Parse protobuf message failed."; \ 286 } \ 287 } while (false) 288 } // namespace ps 289 } // namespace mindspore 290 #endif // MINDSPORE_CCSRC_PS_CONSTANTS_H_ 291