1 /** 2 * Copyright 2021 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_PS_CONSTANTS_H_ 18 #define MINDSPORE_CCSRC_PS_CONSTANTS_H_ 19 20 #include <limits.h> 21 22 #include <iostream> 23 #include <vector> 24 #include <memory> 25 #include <map> 26 #include <string> 27 #include <functional> 28 29 #include "ps/core/communicator/request_process_result_code.h" 30 31 namespace mindspore { 32 namespace ps { 33 constexpr char kEnvCommType[] = "MS_COMM_TYPE"; 34 constexpr char kEnvInterface[] = "MS_INTERFACE"; 35 constexpr char kEnvPServerNum[] = "MS_SERVER_NUM"; 36 constexpr char kEnvWorkerNum[] = "MS_WORKER_NUM"; 37 constexpr char kEnvSchedulerHost[] = "MS_SCHED_HOST"; 38 constexpr char kEnvSchedulerPort[] = "MS_SCHED_PORT"; 39 constexpr char kEnvSchedulerManagePort[] = "MS_SCHED_MANAGE_PORT"; 40 constexpr char kEnvNodeId[] = "MS_NODE_ID"; 41 42 constexpr char kCommTypeOfIBVerbs[] = "ibverbs"; 43 constexpr char kRoleOfPServer[] = "server"; 44 constexpr char kRoleOfWorker[] = "worker"; 45 constexpr char kRoleOfScheduler[] = "scheduler"; 46 47 constexpr char kLearningRate[] = "learning_rate"; 48 constexpr char kMomentum[] = "momentum"; 49 50 constexpr char kApplyMomentum[] = "ApplyMomentum"; 51 constexpr char kSparseAdam[] = "Adam"; 52 constexpr char kSparseLazyAdam[] = "LazyAdam"; 53 constexpr char kSparseFtrl[] = "Ftrl"; 54 constexpr char kApplyMomentumOp[] = "Momentum"; 55 constexpr char 
kSparseAdamOp[] = "Adam"; 56 constexpr char kSparseLazyAdamOp[] = "LazyAdam"; 57 constexpr char kSparseFtrlOp[] = "FTRL"; 58 59 constexpr char kCertificateChain[] = "server.crt"; 60 constexpr char kPrivateKey[] = "server.key"; 61 constexpr char kCAcrt[] = "ca.crt"; 62 63 constexpr int64_t kInitWeightsCmd = 10; 64 constexpr int64_t kInitWeightToOptimIdCmd = 11; 65 constexpr int64_t kInitOptimInputsShapeCmd = 12; 66 constexpr int64_t kInitKeyToPushNodeIdCmd = 13; 67 constexpr int64_t kInitEmbeddingsCmd = 20; 68 constexpr int64_t kUpdateEmbeddingsCmd = 21; 69 constexpr int64_t kCheckReadyForPushCmd = 25; 70 constexpr int64_t kCheckReadyForPullCmd = 26; 71 constexpr int64_t kEmbeddingLookupCmd = 30; 72 constexpr int64_t kFinalizeCmd = 40; 73 constexpr int64_t kPushCmd = 50; 74 constexpr int64_t kPullCmd = 51; 75 76 constexpr size_t kInvalidKey = UINT64_MAX; 77 constexpr int64_t kInvalidID = -1; 78 79 constexpr int64_t kGradIndex = 0; 80 constexpr int64_t kIndiceIndex = 1; 81 constexpr int64_t kFirstDimSize = 2; 82 constexpr int64_t kOutDimSize = 3; 83 84 constexpr int64_t kBase = 10; 85 constexpr float kStdDev = 0.01; 86 87 constexpr int64_t kSparseLazyAdamIndex = 2; 88 constexpr int64_t kSparseFtrlIndex = 3; 89 constexpr int64_t kSparseGradIndex = 6; 90 constexpr int64_t kSparseIndiceIndex = 7; 91 92 constexpr int64_t kHeartbeatTimes = 2; 93 constexpr int64_t kGradValue = -100; 94 95 constexpr uint32_t kMaxMessageSize = static_cast<uint32_t>(100 * (uint32_t(1) << 20)); 96 constexpr char kServerNum[] = "server_num"; 97 constexpr char kWorkerNum[] = "worker_num"; 98 constexpr char kNodesIds[] = "node_ids"; 99 constexpr char kNodeId[] = "node_id"; 100 101 constexpr int64_t kSubmitTaskIntervalInMs = 1; 102 constexpr int64_t kMaxTaskNum = 10240; 103 constexpr int64_t kSubmitTimeOutInMs = 30000; 104 constexpr int64_t kRetryCount = 60; 105 constexpr int64_t kRetryIntervalInMs = 10; 106 107 constexpr int64_t kThreadNum = 32; 108 109 // The timeout period for the scale in node 
to send the finish message to scheduler. 110 constexpr uint32_t kScaleInTimeoutInSenconds = 30; 111 // The number of retries to determine whether all nodes are successfully registered. 112 constexpr uint32_t kCheckRegisteredRetryCount = 30; 113 // The timeout interval for judging whether all nodes are successfully registered. 114 constexpr uint32_t kCheckRegisteredIntervalInMs = 1000; 115 116 // The type of persistent storage, currently only supports file storage. 117 constexpr char kStoreType[] = "storage_type"; 118 // The file used to storage metadata. 119 constexpr char kStoreFilePath[] = "storage_file_path"; 120 // 1 indicates that the persistent storage type is file. 121 constexpr char kFileStorage[] = "1"; 122 // The recovery key of json_config. 123 constexpr char kKeyRecovery[] = "recovery"; 124 constexpr char kRecoveryWorkerNum[] = "worker_num"; 125 constexpr char kRecoveryServerNum[] = "server_num"; 126 constexpr char kRecoverySchedulerIp[] = "scheduler_ip"; 127 constexpr char kRecoverySchedulerPort[] = "scheduler_port"; 128 129 constexpr char kServerCertPath[] = "server_cert_path"; 130 constexpr char kServerPassword[] = "server_password"; 131 constexpr char kCrlPath[] = "crl_path"; 132 constexpr char kClientCertPath[] = "client_cert_path"; 133 constexpr char kClientPassword[] = "client_password"; 134 constexpr char kCaCertPath[] = "ca_cert_path"; 135 136 constexpr char kCipherList[] = "cipher_list"; 137 constexpr char kCertCheckInterval[] = "cert_check_interval_in_hour"; 138 // 7 * 24 139 constexpr int64_t kCertCheckIntervalInHour = 168; 140 constexpr char kCertExpireWarningTime[] = "cert_expire_warning_time_in_day"; 141 // 90 142 constexpr int64_t kCertExpireWarningTimeInDay = 90; 143 constexpr char kConnectionNum[] = "connection_num"; 144 constexpr int64_t kConnectionNumDefault = 10000; 145 constexpr char kLocalIp[] = "127.0.0.1"; 146 147 constexpr int64_t kJanuary = 1; 148 constexpr int64_t kSeventyYear = 70; 149 constexpr int64_t kHundredYear = 100; 
constexpr int64_t kThousandYear = 1000;
constexpr int64_t kBaseYear = 1900;
// Lower and upper bounds for a warning time.
constexpr int64_t kMinWarningTime = 7;
constexpr int64_t kMaxWarningTime = 180;

constexpr int64_t kLength = 100;
constexpr int64_t kMaxPort = 65535;

// Communicator type names.
constexpr char kTcpCommunicator[] = "TCP";
constexpr char kHttpCommunicator[] = "HTTP";

// Certificate file names and the ':' character.
constexpr char kServerCert[] = "server.p12";
constexpr char kClientCert[] = "client.p12";
constexpr char kCaCert[] = "ca.crt";
constexpr char kColon = ':';

// Table of cipher-list tokens, each mapped to a distinct index in [0, 28].
const std::map<std::string, size_t> kCiphers = {
  {"ECDHE-RSA-AES128-GCM-SHA256", 0},
  {"ECDHE-ECDSA-AES128-GCM-SHA256", 1},
  {"ECDHE-RSA-AES256-GCM-SHA384", 2},
  {"ECDHE-ECDSA-AES256-GCM-SHA384", 3},
  {"DHE-RSA-AES128-GCM-SHA256", 4},
  {"DHE-DSS-AES128-GCM-SHA256", 5},
  {"ECDHE-RSA-AES128-SHA256", 6},
  {"ECDHE-ECDSA-AES128-SHA256", 7},
  {"ECDHE-RSA-AES128-SHA", 8},
  {"ECDHE-ECDSA-AES128-SHA", 9},
  {"ECDHE-RSA-AES256-SHA384", 10},
  {"ECDHE-ECDSA-AES256-SHA384", 11},
  {"ECDHE-RSA-AES256-SHA", 12},
  {"ECDHE-ECDSA-AES256-SHA", 13},
  {"DHE-RSA-AES128-SHA256", 14},
  {"DHE-RSA-AES128-SHA", 15},
  {"DHE-DSS-AES128-SHA256", 16},
  {"DHE-RSA-AES256-SHA256", 17},
  {"DHE-DSS-AES256-SHA", 18},
  {"DHE-RSA-AES256-SHA", 19},
  {"!aNULL", 20},
  {"!eNULL", 21},
  {"!EXPORT", 22},
  {"!DES", 23},
  {"!RC4", 24},
  {"!3DES", 25},
  {"!MD5", 26},
  {"!PSK", 27},
  {"kEDH+AESGCM", 28},
};

// Byte buffer aliases.
using DataPtr = std::shared_ptr<unsigned char[]>;
using VectorPtr = std::shared_ptr<std::vector<unsigned char>>;

// Key aliases; LookupIds is the same type as Keys.
using Key = uint64_t;
using Keys = std::vector<Key>;
using LookupIds = Keys;

// Float vector aliases and their shared-pointer forms.
using Values = std::vector<float>;
using ValuesPtr = std::shared_ptr<Values>;
using Weight = std::vector<float>;
using WeightPtr = std::shared_ptr<Weight>;
using Grad = std::vector<float>;
using GradPtr = std::shared_ptr<Grad>;

using Lengths = std::vector<int>;
207 using InputsShape = std::vector<std::shared_ptr<std::vector<size_t>>>; 208 using InputsShapePtr = std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>>; 209 210 constexpr size_t INDEX_NOT_SEND = UINT_MAX; 211 using OptimOriginIdx = std::map<std::string, size_t>; 212 using OptimPSSendIdx = std::map<std::string, size_t>; 213 214 using EventCallback = std::function<void(void)>; 215 216 const OptimOriginIdx kMomentumOriginIdx = {{"weight", 0}, {"accum", 1}, {"lr", 2}, {"grad", 3}, {"momentum", 4}}; 217 const OptimPSSendIdx kMomentumPSSendIdx = { 218 {"weight", INDEX_NOT_SEND}, {"accum", INDEX_NOT_SEND}, {"lr", 0}, {"grad", 1}, {"momentum", 2}}; 219 220 const OptimOriginIdx kSparseAdamOriginIdx = {{"weight", 0}, {"m", 1}, {"v", 2}, {"beta1_power", 3}, 221 {"beta2_power", 4}, {"lr", 5}, {"beta1", 6}, {"beta2", 7}, 222 {"eps", 8}, {"grad", 9}, {"indices", 10}}; 223 const OptimPSSendIdx kSparseAdamPSSendIdx = {{"weight", INDEX_NOT_SEND}, 224 {"m", INDEX_NOT_SEND}, 225 {"v", INDEX_NOT_SEND}, 226 {"beta1_power", 0}, 227 {"beta2_power", 1}, 228 {"lr", 2}, 229 {"beta1", 3}, 230 {"beta2", 4}, 231 {"eps", 5}, 232 {"grad", 6}, 233 {"indices", 7}}; 234 235 const OptimOriginIdx kSparseFtrlOriginIdx = {{"weight", 0}, {"accum", 1}, {"linear", 2}, {"grad", 3}, {"indices", 4}}; 236 const OptimPSSendIdx kSparseFtrlPSSendIdx = { 237 {"weight", INDEX_NOT_SEND}, {"accum", INDEX_NOT_SEND}, {"linear", INDEX_NOT_SEND}, {"grad", 0}, {"indices", 1}}; 238 239 const std::map<std::string, OptimOriginIdx> kOptimToOriginIdx = {{kApplyMomentum, kMomentumOriginIdx}, 240 {kSparseAdam, kSparseAdamOriginIdx}, 241 {kSparseLazyAdam, kSparseAdamOriginIdx}, 242 {kSparseFtrl, kSparseFtrlOriginIdx}}; 243 const std::map<std::string, OptimOriginIdx> kOptimToPSSendIdx = {{kApplyMomentum, kMomentumPSSendIdx}, 244 {kSparseAdam, kSparseAdamPSSendIdx}, 245 {kSparseLazyAdam, kSparseAdamPSSendIdx}, 246 {kSparseFtrl, kSparseFtrlPSSendIdx}}; 247 248 // The barrier function which should be called before 
doing scaling out/in operations. 249 // It's easy for us to scale out/in nodes after one iteration is completed and keep consistent. 250 using BarrierBeforeScaleOut = std::function<void(void)>; 251 using BarrierBeforeScaleIn = std::function<void(void)>; 252 253 // These handlers helps worker/server node to reinitialize or recover data after scaling out/in operation of scheduler 254 // is done. 255 using HandlerAfterScaleOut = std::function<void(void)>; 256 using HandlerAfterScaleIn = std::function<void(void)>; 257 258 constexpr char kClusterSafeMode[] = "The cluster is in safemode."; 259 constexpr char kJobNotAvailable[] = "The server's training job is disabled or finished."; 260 261 enum class CustomEvent { kIterationRunning = 0, kIterationCompleted }; 262 263 #define EXC_IF_VEC_IDX_OOB(vec, idx) \ 264 { \ 265 size_t vec_size = vec.size(); \ 266 if (idx >= vec_size) { \ 267 MS_LOG(EXCEPTION) << "Vector " << #vec << " size is " << vec_size << ". So index " << idx \ 268 << " is out of bound."; \ 269 } \ 270 } 271 272 #define ERROR_STATUS(result, code, message) \ 273 MS_LOG(ERROR) << message; \ 274 result = RequestProcessResult(code, message) 275 276 #define CHECK_RETURN_TYPE(_condition) \ 277 do { \ 278 if (!(_condition)) { \ 279 MS_LOG(ERROR) << "Parse protobuf message failed."; \ 280 } \ 281 } while (false) 282 } // namespace ps 283 } // namespace mindspore 284 #endif // MINDSPORE_CCSRC_PS_CONSTANTS_H_ 285