• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_PS_CONSTANTS_H_
18 #define MINDSPORE_CCSRC_PS_CONSTANTS_H_
19 
#include <limits.h>

#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "utils/shape_utils.h"
29 
30 namespace mindspore {
namespace distributed::persistent {
// Forward declarations only: the class templates are defined elsewhere. They are
// declared here so that aliases further down can name Data<float> and
// PersistentData<float> without pulling in their headers.
template <class ElemT>
class Data;

template <class ElemT>
class PersistentData;
}  // namespace distributed::persistent
37 namespace ps {
// MS_* names kept as text. Judging by the kEnv prefix these are presumably environment
// variable names; this header only defines the strings, not where they are read.
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr char kEnvCommType[] = "MS_COMM_TYPE";
inline constexpr char kEnvInterface[] = "MS_INTERFACE";
inline constexpr char kEnvPServerNum[] = "MS_SERVER_NUM";
inline constexpr char kEnvWorkerNum[] = "MS_WORKER_NUM";
inline constexpr char kEnvSchedulerHost[] = "MS_SCHED_HOST";
inline constexpr char kEnvSchedulerPort[] = "MS_SCHED_PORT";
inline constexpr char kEnvSchedulerManagePort[] = "MS_SCHED_MANAGE_PORT";
inline constexpr char kEnvNodeId[] = "MS_NODE_ID";
46 
// Textual identifiers for a communication type and for the three node roles.
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr char kCommTypeOfIBVerbs[] = "ibverbs";
inline constexpr char kRoleOfPServer[] = "server";
inline constexpr char kRoleOfWorker[] = "worker";
inline constexpr char kRoleOfScheduler[] = "scheduler";
51 
// Hyper-parameter names as text.
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr char kLearningRate[] = "learning_rate";
inline constexpr char kMomentum[] = "momentum";
54 
// Optimizer names. The first four are also the keys of the optimizer index maps defined
// later in this header (kOptimToOriginIdx / kOptimToPSSendIdx).
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr char kApplyMomentum[] = "ApplyMomentum";
inline constexpr char kSparseAdam[] = "Adam";
inline constexpr char kSparseLazyAdam[] = "LazyAdam";
inline constexpr char kSparseFtrl[] = "Ftrl";
// "*Op" spellings. Note that they are not all identical to the names above:
// "Momentum" vs "ApplyMomentum" and "FTRL" vs "Ftrl" differ, while the Adam and
// LazyAdam pairs carry the same text.
inline constexpr char kApplyMomentumOp[] = "Momentum";
inline constexpr char kSparseAdamOp[] = "Adam";
inline constexpr char kSparseLazyAdamOp[] = "LazyAdam";
inline constexpr char kSparseFtrlOp[] = "FTRL";
63 
// File names of the certificate chain, private key and CA certificate.
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr char kCertificateChain[] = "server.crt";
inline constexpr char kPrivateKey[] = "server.key";
inline constexpr char kCAcrt[] = "ca.crt";
67 
// Assorted string constants: field names, a recover-function name/value pair, a
// directory prefix and a key-suffix marker. Their consumers are not visible in this
// header.
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr char kKeys[] = "keys";
inline constexpr char kShapes[] = "shapes";
inline constexpr char kParamNames[] = "param_names";
inline constexpr char kRecoverFunc[] = "recover_function";
inline constexpr char kRecoverEmbedding[] = "RecoverEmbedding";
inline constexpr char kCurrentDirOfServer[] = "./server_";
inline constexpr char kParamWithKey[] = "_parameter_key_";
75 
// Numeric command identifiers. The values are grouped by decade (1x init, 2x embedding
// tables and readiness checks, 30 lookup, 40 finalize, 5x push/pull).
// NOTE(review): presumably these travel between nodes, so the numbers should be treated
// as fixed - confirm before renumbering.
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr int64_t kInitWeightsCmd = 10;
inline constexpr int64_t kInitWeightToOptimIdCmd = 11;
inline constexpr int64_t kInitOptimInputsShapeCmd = 12;
inline constexpr int64_t kInitKeyToPushNodeIdCmd = 13;
inline constexpr int64_t kInitEmbeddingsCmd = 20;
inline constexpr int64_t kUpdateEmbeddingsCmd = 21;
inline constexpr int64_t kCheckReadyForPushCmd = 25;
inline constexpr int64_t kCheckReadyForPullCmd = 26;
inline constexpr int64_t kEmbeddingLookupCmd = 30;
inline constexpr int64_t kFinalizeCmd = 40;
inline constexpr int64_t kPushCmd = 50;
inline constexpr int64_t kPullCmd = 51;
88 
// Sentinel values for "no key" / "no id".
// kInvalidKey is the largest size_t. SIZE_MAX is used rather than UINT64_MAX so the
// initializer matches the variable's type on every platform (no implicit truncation
// where size_t is narrower than 64 bits); the resulting value is unchanged.
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr size_t kInvalidKey = SIZE_MAX;
inline constexpr int64_t kInvalidID = -1;
91 
// 100 * 2^20 = 104857600 (100 MiB when read as a byte count).
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr uint32_t kMaxMessageSize = uint32_t{100} * (uint32_t{1} << 20U);
// Field names for node counts and node ids.
inline constexpr char kServerNum[] = "server_num";
inline constexpr char kWorkerNum[] = "worker_num";
inline constexpr char kNodesIds[] = "node_ids";
inline constexpr char kNodeId[] = "node_id";
97 
// Result codes, kept as strings ("0" for success, "1" for error).
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr char kSuccessCode[] = "0";
inline constexpr char kErrorCode[] = "1";
100 
// Task submission and retry tuning. The "InMs" suffix marks values expressed in
// milliseconds.
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr int64_t kSubmitTaskIntervalInMs = 1;
inline constexpr int64_t kMaxTaskNum = 10240;
inline constexpr int64_t kSubmitTimeOutInMs = 30000;
inline constexpr int64_t kRetryCount = 60;
inline constexpr int64_t kRetryIntervalInMs = 10;
106 
// Thread count plus small positional/size constants.
// NOTE(review): what kGradIndex/kIndiceIndex index into, and which shapes
// kFirstDimSize/kOutDimSize describe, is not visible in this header.
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr int64_t kThreadNum = 32;
inline constexpr int64_t kGradIndex = 0;
inline constexpr int64_t kIndiceIndex = 1;
inline constexpr int64_t kFirstDimSize = 2;
inline constexpr int64_t kOutDimSize = 3;
112 
// `inline` (C++17) on every constant below: one shared definition across every
// translation unit that includes this header, instead of a private copy per unit.
inline constexpr int64_t kBase = 10;
// Float literal so that the initializer already has the variable's type (the previous
// double literal was converted implicitly; the stored value is the same).
inline constexpr float kStdDev = 0.01f;
// The timeout period for the scale-in node to send the finish message to the scheduler.
// NOTE(review): "Senconds" is a misspelling of "Seconds"; the identifier is left
// unchanged because code outside this header may refer to it.
inline constexpr uint32_t kScaleInTimeoutInSenconds = 30;
// The number of retries used to determine whether all nodes are successfully registered.
inline constexpr uint32_t kCheckRegisteredRetryCount = 30;
// The interval, in milliseconds, between checks of whether all nodes are successfully registered.
inline constexpr uint32_t kCheckRegisteredIntervalInMs = 1000;
121 
// Positional constants for the sparse optimizers.
// NOTE(review): 6 and 7 coincide with the "grad" and "indices" entries of
// kSparseAdamPSSendIdx defined later in this header - confirm whether the two are
// meant to stay in sync.
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr int64_t kSparseLazyAdamIndex = 2;
inline constexpr int64_t kSparseFtrlIndex = 3;
inline constexpr int64_t kSparseGradIndex = 6;
inline constexpr int64_t kSparseIndiceIndex = 7;
126 
// NOTE(review): the meaning of these two values (a heartbeat multiplier and a
// gradient placeholder, going by the names) is not established in this header.
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr int64_t kHeartbeatTimes = 2;
inline constexpr int64_t kGradValue = -100;
// `inline` (C++17) on every constant below: one shared definition across every
// translation unit that includes this header, instead of a private copy per unit.
// Whether to support recovery.
inline constexpr char kIsRecovery[] = "is_recovery";
// The type of persistent storage; currently only file storage is supported.
inline constexpr char kStoreType[] = "storage_type";
// The file used to store metadata.
inline constexpr char kStoreFilePath[] = "storage_file_path";
// The file used to store scheduler metadata.
inline constexpr char kSchedulerStoreFilePath[] = "scheduler_storage_file_path";
// "1" indicates that the persistent storage type is file.
inline constexpr char kFileStorage[] = "1";
// The recovery key of json_config.
inline constexpr char kKeyRecovery[] = "recovery";
// Field names used with recovery data. Several carry the same text as constants
// defined earlier in this header ("worker_num", "server_num", "node_ids").
inline constexpr char kRecoveryWorkerNum[] = "worker_num";
inline constexpr char kRecoveryServerNum[] = "server_num";
inline constexpr char kRecoverySchedulerIp[] = "scheduler_ip";
inline constexpr char kRecoverySchedulerPort[] = "scheduler_port";
inline constexpr char kRecoveryTotalNodeNum[] = "total_node_num";
inline constexpr char kRecoveryNextWorkerRankId[] = "next_worker_rank_id";
inline constexpr char kRecoveryNextServerRankId[] = "next_server_rank_id";
inline constexpr char kRecoveryRegisteredNodesInfos[] = "node_ids";
inline constexpr char kRecoveryClusterState[] = "cluster_state";
150 
// Names of certificate/TLS related configuration entries, with a few default values.
// `inline` (C++17) on every constant below: one shared definition across every
// translation unit that includes this header, instead of a private copy per unit.
inline constexpr char kServerCertPath[] = "server_cert_path";
inline constexpr char kServerPassword[] = "server_password";
inline constexpr char kCrlPath[] = "crl_path";
inline constexpr char kClientCertPath[] = "client_cert_path";
inline constexpr char kClientPassword[] = "client_password";
inline constexpr char kCaCertPath[] = "ca_cert_path";
inline constexpr char kCipherList[] = "cipher_list";
inline constexpr char kCertCheckInterval[] = "cert_check_interval_in_hour";
// One week expressed in hours (168).
inline constexpr int64_t kCertCheckIntervalInHour = 7 * 24;
inline constexpr char kCertExpireWarningTime[] = "cert_expire_warning_time_in_day";
// 90 days.
inline constexpr int64_t kCertExpireWarningTimeInDay = 90;
inline constexpr char kConnectionNum[] = "connection_num";
inline constexpr int64_t kConnectionNumDefault = 10000;
// IPv4 loopback address.
inline constexpr char kLocalIp[] = "127.0.0.1";
167 
// Calendar-related numbers and the lower/upper bounds of a warning time.
// NOTE(review): the arithmetic that uses the year constants lives outside this header;
// the unit of kMinWarningTime/kMaxWarningTime is not stated here (presumably days).
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr int64_t kJanuary = 1;
inline constexpr int64_t kSeventyYear = 70;
inline constexpr int64_t kHundredYear = 100;
inline constexpr int64_t kThousandYear = 1000;
inline constexpr int64_t kBaseYear = 1900;
inline constexpr int64_t kMinWarningTime = 7;
inline constexpr int64_t kMaxWarningTime = 180;
175 
// kMaxPort is the largest 16-bit port number (65535).
// NOTE(review): what kLength measures and which scale kSecurityLevel belongs to is not
// visible in this header.
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr int64_t kLength = 100;
inline constexpr int64_t kMaxPort = 65535;
inline constexpr int64_t kSecurityLevel = 3;
179 
// Communicator kind names.
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr char kTcpCommunicator[] = "TCP";
inline constexpr char kHttpCommunicator[] = "HTTP";
182 
// Certificate file names and the ':' separator character.
// `inline` (C++17) on every object below: one shared definition across every
// translation unit that includes this header, instead of a private copy per unit.
inline constexpr char kServerCert[] = "server.p12";
inline constexpr char kClientCert[] = "client.p12";
inline constexpr char kCaCert[] = "ca.crt";
inline constexpr char kColon = ':';
// Cipher suite name -> index (0..8). std::map is not a literal type, so this object is
// built at program start-up; without `inline` that construction would be repeated in
// every translation unit that includes the header.
inline const std::map<std::string, size_t> kCiphers = {
  {"ECDHE-RSA-AES128-GCM-SHA256", 0},   {"ECDHE-ECDSA-AES128-GCM-SHA256", 1}, {"ECDHE-RSA-AES256-GCM-SHA384", 2},
  {"ECDHE-ECDSA-AES256-GCM-SHA384", 3}, {"ECDHE-RSA-CHACHA20-POLY1305", 4},   {"ECDHE-PSK-CHACHA20-POLY1305", 5},
  {"ECDHE-ECDSA-AES128-CCM", 6},        {"ECDHE-ECDSA-AES256-CCM", 7},        {"ECDHE-ECDSA-CHACHA20-POLY1305", 8}};
191 
192 using DataPtr = std::unique_ptr<uint8_t[]>;
193 using VectorPtr = std::shared_ptr<std::vector<unsigned char>>;
194 using Key = size_t;
195 using Keys = std::vector<Key>;
196 using Values = std::vector<float>;
197 using ValuesPtr = std::shared_ptr<Values>;
198 using Weight = distributed::persistent::Data<float>;
199 using PersistentWeight = distributed::persistent::PersistentData<float>;
200 using Grad = std::vector<float>;
201 using LookupIds = std::vector<Key>;
202 using Lengths = std::vector<int>;
203 using WeightPtr = std::shared_ptr<Weight>;
204 using PersistentWeightPtr = std::shared_ptr<PersistentWeight>;
205 using GradPtr = std::shared_ptr<Grad>;
206 using InputsShape = std::vector<std::shared_ptr<ShapeVector>>;
207 using InputsShapePtr = std::shared_ptr<std::vector<std::shared_ptr<ShapeVector>>>;
208 
// Marker stored in the *PSSendIdx tables for an input that has no send position.
// Its value is UINT_MAX (not the maximum of size_t).
// `inline` (C++17) on every object below: one shared definition across every
// translation unit that includes this header, instead of a private copy per unit.
inline constexpr size_t INDEX_NOT_SEND = UINT_MAX;
// Input name -> position in the optimizer's original input list.
using OptimOriginIdx = std::map<std::string, size_t>;
// Input name -> position among the inputs that are sent, or INDEX_NOT_SEND.
using OptimPSSendIdx = std::map<std::string, size_t>;

using EventCallback = std::function<void(void)>;

// The tables below are std::map objects and therefore constructed at program start-up;
// `inline` keeps that to a single construction per table for the whole program.

// Momentum: weight and accum are not sent; lr, grad, momentum are sent as 0, 1, 2.
inline const OptimOriginIdx kMomentumOriginIdx = {{"weight", 0}, {"accum", 1}, {"lr", 2}, {"grad", 3}, {"momentum", 4}};
inline const OptimPSSendIdx kMomentumPSSendIdx = {
  {"weight", INDEX_NOT_SEND}, {"accum", INDEX_NOT_SEND}, {"lr", 0}, {"grad", 1}, {"momentum", 2}};

// Sparse Adam: weight, m and v are not sent; the remaining eight inputs keep their
// relative order and are sent as 0..7.
inline const OptimOriginIdx kSparseAdamOriginIdx = {{"weight", 0},      {"m", 1},    {"v", 2},       {"beta1_power", 3},
                                                    {"beta2_power", 4}, {"lr", 5},   {"beta1", 6},   {"beta2", 7},
                                                    {"eps", 8},         {"grad", 9}, {"indices", 10}};
inline const OptimPSSendIdx kSparseAdamPSSendIdx = {{"weight", INDEX_NOT_SEND},
                                                    {"m", INDEX_NOT_SEND},
                                                    {"v", INDEX_NOT_SEND},
                                                    {"beta1_power", 0},
                                                    {"beta2_power", 1},
                                                    {"lr", 2},
                                                    {"beta1", 3},
                                                    {"beta2", 4},
                                                    {"eps", 5},
                                                    {"grad", 6},
                                                    {"indices", 7}};

// Sparse FTRL: only grad and indices are sent (as 0 and 1).
inline const OptimOriginIdx kSparseFtrlOriginIdx = {
  {"weight", 0}, {"accum", 1}, {"linear", 2}, {"grad", 3}, {"indices", 4}};
inline const OptimPSSendIdx kSparseFtrlPSSendIdx = {
  {"weight", INDEX_NOT_SEND}, {"accum", INDEX_NOT_SEND}, {"linear", INDEX_NOT_SEND}, {"grad", 0}, {"indices", 1}};
237 
238 const std::map<std::string, OptimOriginIdx> kOptimToOriginIdx = {{kApplyMomentum, kMomentumOriginIdx},
239                                                                  {kSparseAdam, kSparseAdamOriginIdx},
240                                                                  {kSparseLazyAdam, kSparseAdamOriginIdx},
241                                                                  {kSparseFtrl, kSparseFtrlOriginIdx}};
242 const std::map<std::string, OptimOriginIdx> kOptimToPSSendIdx = {{kApplyMomentum, kMomentumPSSendIdx},
243                                                                  {kSparseAdam, kSparseAdamPSSendIdx},
244                                                                  {kSparseLazyAdam, kSparseAdamPSSendIdx},
245                                                                  {kSparseFtrl, kSparseFtrlPSSendIdx}};
246 
// Barrier callbacks that should be invoked before a scale-out / scale-in operation.
// Scaling nodes is easiest, and stays consistent, once an iteration has completed.
using BarrierBeforeScaleOut = std::function<void()>;
using BarrierBeforeScaleIn = std::function<void()>;

// Handlers that help a worker/server node reinitialize or recover its data after the
// scheduler has finished a scale-out / scale-in operation.
using HandlerAfterScaleOut = std::function<void()>;
using HandlerAfterScaleIn = std::function<void()>;
using HandlerAfterScaleOutRollback = std::function<void()>;
257 
// Human-readable status messages. The texts are runtime strings and are kept exactly
// as they were.
// `inline` (C++17): one shared definition across every translation unit that includes
// this header, instead of a private internal-linkage copy per translation unit.
inline constexpr char kClusterNotReady[] =
  "The Scheduler's connections are not equal with total node num, Maybe this is because some server nodes are drop "
  "out or scale in nodes has not been recycled.";
inline constexpr char kJobNotReady[] = "The server's training job is not ready.";
inline constexpr char kClusterSafeMode[] = "The cluster is in safemode.";
inline constexpr char kJobNotAvailable[] = "The server's training job is disabled or finished.";
264 
// User-defined events; the enumerators are numbered consecutively from 0.
enum class UserDefineEvent {
  kIterationRunning = 0,
  kIterationCompleted = 1,
  kNodeTimeout = 2,
};
266 
// Logs through MS_LOG(EXCEPTION) when `idx` is not a valid index into `vec`
// (idx >= vec.size()).
// NOTE(review): MS_LOG is defined elsewhere; presumably the EXCEPTION level raises.
//
// `vec` and `idx` are each evaluated exactly once, so an argument with side effects
// (e.g. `i++` or a function call) is safe. `vec` is parenthesized so that expressions
// such as `*ptr` work, and the locals carry an `exc_..._` prefix/suffix to make a clash
// with a caller variable that appears inside `idx` unlikely.
#define EXC_IF_VEC_IDX_OOB(vec, idx)                                                                      \
  do {                                                                                                    \
    const size_t exc_vec_size_ = (vec).size();                                                            \
    const auto exc_vec_idx_ = (idx);                                                                      \
    if (exc_vec_idx_ >= exc_vec_size_) {                                                                  \
      MS_LOG(EXCEPTION) << "Vector " << #vec << " size is " << exc_vec_size_ << ". So index "             \
                        << exc_vec_idx_ << " is out of bound.";                                           \
    }                                                                                                     \
  } while (0)
275 
// Logs `message` at ERROR level and assigns RequestProcessResult(code, message) to
// `result`. MS_LOG and RequestProcessResult are defined elsewhere.
//
// `message` is evaluated once and reused for both the log line and the result, so an
// argument that builds a string (or has side effects) is not run twice. `result` and
// `code` are parenthesized so that compound expressions expand as intended.
#define ERROR_STATUS(result, code, message)                           \
  do {                                                                \
    const auto &error_status_message_ = (message);                    \
    MS_LOG(ERROR) << error_status_message_;                           \
    (result) = RequestProcessResult((code), error_status_message_);   \
  } while (0)
281 
// Logs "Parse protobuf message failed." at ERROR level when `condition` is false.
// Despite the name, the macro only logs: it does not return from the enclosing
// function and does not alter control flow.
#define CHECK_RETURN_TYPE(condition)                     \
  do {                                                   \
    if (!(condition)) {                                  \
      MS_LOG(ERROR) << "Parse protobuf message failed."; \
    }                                                    \
  } while (0)
288 }  // namespace ps
289 }  // namespace mindspore
290 #endif  // MINDSPORE_CCSRC_PS_CONSTANTS_H_
291