• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_PS_CONSTANTS_H_
18 #define MINDSPORE_CCSRC_PS_CONSTANTS_H_
19 
20 #include <limits.h>
21 
22 #include <iostream>
23 #include <vector>
24 #include <memory>
25 #include <map>
26 #include <string>
27 #include <functional>
28 
29 #include "ps/core/communicator/request_process_result_code.h"
30 
31 namespace mindspore {
32 namespace ps {
// Keys spelled like the MS_* environment variables (kEnv* prefix). How they are consumed is not shown in this header.
constexpr char kEnvCommType[]{"MS_COMM_TYPE"};
constexpr char kEnvInterface[]{"MS_INTERFACE"};
constexpr char kEnvPServerNum[]{"MS_SERVER_NUM"};
constexpr char kEnvWorkerNum[]{"MS_WORKER_NUM"};
constexpr char kEnvSchedulerHost[]{"MS_SCHED_HOST"};
constexpr char kEnvSchedulerPort[]{"MS_SCHED_PORT"};
constexpr char kEnvSchedulerManagePort[]{"MS_SCHED_MANAGE_PORT"};
constexpr char kEnvNodeId[]{"MS_NODE_ID"};

// Communication-type value followed by the three node role names.
constexpr char kCommTypeOfIBVerbs[]{"ibverbs"};
constexpr char kRoleOfPServer[]{"server"};
constexpr char kRoleOfWorker[]{"worker"};
constexpr char kRoleOfScheduler[]{"scheduler"};
46 
// Hyper-parameter names.
constexpr char kLearningRate[]{"learning_rate"};
constexpr char kMomentum[]{"momentum"};

// Optimizer names and their *Op counterparts. The two families are spelled differently for
// momentum ("ApplyMomentum" vs "Momentum") and for FTRL ("Ftrl" vs "FTRL"); Adam and LazyAdam match.
constexpr char kApplyMomentum[]{"ApplyMomentum"};
constexpr char kSparseAdam[]{"Adam"};
constexpr char kSparseLazyAdam[]{"LazyAdam"};
constexpr char kSparseFtrl[]{"Ftrl"};
constexpr char kApplyMomentumOp[]{"Momentum"};
constexpr char kSparseAdamOp[]{"Adam"};
constexpr char kSparseLazyAdamOp[]{"LazyAdam"};
constexpr char kSparseFtrlOp[]{"FTRL"};
58 
// File names of the certificate chain, private key and CA certificate.
constexpr char kCertificateChain[]{"server.crt"};
constexpr char kPrivateKey[]{"server.key"};
constexpr char kCAcrt[]{"ca.crt"};

// Numeric command codes (k*Cmd). The values are not contiguous; they are grouped by decade.
constexpr int64_t kInitWeightsCmd{10};
constexpr int64_t kInitWeightToOptimIdCmd{11};
constexpr int64_t kInitOptimInputsShapeCmd{12};
constexpr int64_t kInitKeyToPushNodeIdCmd{13};
constexpr int64_t kInitEmbeddingsCmd{20};
constexpr int64_t kUpdateEmbeddingsCmd{21};
constexpr int64_t kCheckReadyForPushCmd{25};
constexpr int64_t kCheckReadyForPullCmd{26};
constexpr int64_t kEmbeddingLookupCmd{30};
constexpr int64_t kFinalizeCmd{40};
constexpr int64_t kPushCmd{50};
constexpr int64_t kPullCmd{51};
75 
// Sentinel values for "no key" / "no id".
// kInvalidKey keeps copy-initialization: it stores UINT64_MAX in a size_t, which brace-initialization
// would reject on a platform where size_t is narrower than 64 bits.
constexpr size_t kInvalidKey = UINT64_MAX;
constexpr int64_t kInvalidID{-1};

// Small index / size constants (0..3).
constexpr int64_t kGradIndex{0};
constexpr int64_t kIndiceIndex{1};
constexpr int64_t kFirstDimSize{2};
constexpr int64_t kOutDimSize{3};

// kStdDev is a float initialized from the double literal 0.01 (kept as written).
constexpr int64_t kBase{10};
constexpr float kStdDev = 0.01;

// Index constants for the sparse optimizers.
constexpr int64_t kSparseLazyAdamIndex{2};
constexpr int64_t kSparseFtrlIndex{3};
constexpr int64_t kSparseGradIndex{6};
constexpr int64_t kSparseIndiceIndex{7};

constexpr int64_t kHeartbeatTimes{2};
constexpr int64_t kGradValue{-100};
94 
// 100 MiB expressed in bytes (100 * 2^20 = 104857600).
constexpr uint32_t kMaxMessageSize = 100U * 1024U * 1024U;
constexpr char kServerNum[]{"server_num"};
constexpr char kWorkerNum[]{"worker_num"};
constexpr char kNodesIds[]{"node_ids"};
constexpr char kNodeId[]{"node_id"};

// Task submission and retry settings; the *InMs values are in milliseconds per their names.
constexpr int64_t kSubmitTaskIntervalInMs{1};
constexpr int64_t kMaxTaskNum{10240};
constexpr int64_t kSubmitTimeOutInMs{30000};
constexpr int64_t kRetryCount{60};
constexpr int64_t kRetryIntervalInMs{10};

constexpr int64_t kThreadNum{32};

// Timeout for a scale-in node to send its finish message to the scheduler.
// The identifier's spelling ("Senconds") is part of the public name and is kept as is.
constexpr uint32_t kScaleInTimeoutInSenconds{30};
// How many times to re-check whether all nodes have registered successfully.
constexpr uint32_t kCheckRegisteredRetryCount{30};
// Interval between two checks of whether all nodes have registered successfully.
constexpr uint32_t kCheckRegisteredIntervalInMs{1000};
115 
// Config key for the persistent storage type; only file storage is currently supported.
constexpr char kStoreType[]{"storage_type"};
// Config key for the file used to store metadata.
constexpr char kStoreFilePath[]{"storage_file_path"};
// Storage-type value "1" means file storage.
constexpr char kFileStorage[]{"1"};
// The "recovery" key of json_config, followed by the recovery-related keys.
constexpr char kKeyRecovery[]{"recovery"};
constexpr char kRecoveryWorkerNum[]{"worker_num"};
constexpr char kRecoveryServerNum[]{"server_num"};
constexpr char kRecoverySchedulerIp[]{"scheduler_ip"};
constexpr char kRecoverySchedulerPort[]{"scheduler_port"};
128 
// Config keys for certificate paths and passwords.
constexpr char kServerCertPath[]{"server_cert_path"};
constexpr char kServerPassword[]{"server_password"};
constexpr char kCrlPath[]{"crl_path"};
constexpr char kClientCertPath[]{"client_cert_path"};
constexpr char kClientPassword[]{"client_password"};
constexpr char kCaCertPath[]{"ca_cert_path"};

// Config keys for cipher list, certificate checking and connection count, each followed by the
// related numeric constant where one exists.
constexpr char kCipherList[]{"cipher_list"};
constexpr char kCertCheckInterval[]{"cert_check_interval_in_hour"};
// One week in hours (168).
constexpr int64_t kCertCheckIntervalInHour = 7 * 24;
constexpr char kCertExpireWarningTime[]{"cert_expire_warning_time_in_day"};
// 90 days.
constexpr int64_t kCertExpireWarningTimeInDay{90};
constexpr char kConnectionNum[]{"connection_num"};
constexpr int64_t kConnectionNumDefault{10000};
constexpr char kLocalIp[]{"127.0.0.1"};
146 
// Calendar-related constants and the allowed warning-time range (7..180).
constexpr int64_t kJanuary{1};
constexpr int64_t kSeventyYear{70};
constexpr int64_t kHundredYear{100};
constexpr int64_t kThousandYear{1000};
constexpr int64_t kBaseYear{1900};
constexpr int64_t kMinWarningTime{7};
constexpr int64_t kMaxWarningTime{180};

constexpr int64_t kLength{100};
// Largest valid TCP/UDP port number.
constexpr int64_t kMaxPort{65535};

// Communicator type names.
constexpr char kTcpCommunicator[]{"TCP"};
constexpr char kHttpCommunicator[]{"HTTP"};

// Certificate file names and the ':' separator character.
constexpr char kServerCert[]{"server.p12"};
constexpr char kClientCert[]{"client.p12"};
constexpr char kCaCert[]{"ca.crt"};
constexpr char kColon{':'};
// Cipher-list tokens mapped to the consecutive indices 0..28.
// Entries 0-19 are cipher suite names, 20-27 are "!"-prefixed exclusion tokens and 28 is "kEDH+AESGCM".
const std::map<std::string, size_t> kCiphers = {
  // ECDHE / DHE suites with AES-GCM.
  {"ECDHE-RSA-AES128-GCM-SHA256", 0},
  {"ECDHE-ECDSA-AES128-GCM-SHA256", 1},
  {"ECDHE-RSA-AES256-GCM-SHA384", 2},
  {"ECDHE-ECDSA-AES256-GCM-SHA384", 3},
  {"DHE-RSA-AES128-GCM-SHA256", 4},
  {"DHE-DSS-AES128-GCM-SHA256", 5},
  // ECDHE suites without GCM.
  {"ECDHE-RSA-AES128-SHA256", 6},
  {"ECDHE-ECDSA-AES128-SHA256", 7},
  {"ECDHE-RSA-AES128-SHA", 8},
  {"ECDHE-ECDSA-AES128-SHA", 9},
  {"ECDHE-RSA-AES256-SHA384", 10},
  {"ECDHE-ECDSA-AES256-SHA384", 11},
  {"ECDHE-RSA-AES256-SHA", 12},
  {"ECDHE-ECDSA-AES256-SHA", 13},
  // DHE suites without GCM.
  {"DHE-RSA-AES128-SHA256", 14},
  {"DHE-RSA-AES128-SHA", 15},
  {"DHE-DSS-AES128-SHA256", 16},
  {"DHE-RSA-AES256-SHA256", 17},
  {"DHE-DSS-AES256-SHA", 18},
  {"DHE-RSA-AES256-SHA", 19},
  // Exclusion tokens.
  {"!aNULL", 20},
  {"!eNULL", 21},
  {"!EXPORT", 22},
  {"!DES", 23},
  {"!RC4", 24},
  {"!3DES", 25},
  {"!MD5", 26},
  {"!PSK", 27},
  {"kEDH+AESGCM", 28}};
194 
// Shared-ownership byte buffers.
using DataPtr = std::shared_ptr<unsigned char[]>;
using VectorPtr = std::shared_ptr<std::vector<unsigned char>>;

// Keys are 64-bit unsigned integers; values, weights and gradients are all vectors of float.
using Key = uint64_t;
using Keys = std::vector<Key>;
using Values = std::vector<float>;
using ValuesPtr = std::shared_ptr<Values>;
using Weight = Values;
using Grad = Values;
using LookupIds = Keys;
using Lengths = std::vector<int>;
using WeightPtr = std::shared_ptr<Weight>;
using GradPtr = std::shared_ptr<Grad>;

// A list of shared shape vectors, and a shared pointer to such a list.
using InputsShape = std::vector<std::shared_ptr<std::vector<size_t>>>;
using InputsShapePtr = std::shared_ptr<InputsShape>;

// Marker index stored in the *PSSendIdx tables for inputs flagged as "not send".
constexpr size_t INDEX_NOT_SEND = UINT_MAX;
// Both aliases name the same type: a map from input name to index.
using OptimOriginIdx = std::map<std::string, size_t>;
using OptimPSSendIdx = std::map<std::string, size_t>;

// Callback taking no arguments and returning nothing.
using EventCallback = std::function<void()>;
215 
216 const OptimOriginIdx kMomentumOriginIdx = {{"weight", 0}, {"accum", 1}, {"lr", 2}, {"grad", 3}, {"momentum", 4}};
217 const OptimPSSendIdx kMomentumPSSendIdx = {
218   {"weight", INDEX_NOT_SEND}, {"accum", INDEX_NOT_SEND}, {"lr", 0}, {"grad", 1}, {"momentum", 2}};
219 
220 const OptimOriginIdx kSparseAdamOriginIdx = {{"weight", 0},      {"m", 1},    {"v", 2},       {"beta1_power", 3},
221                                              {"beta2_power", 4}, {"lr", 5},   {"beta1", 6},   {"beta2", 7},
222                                              {"eps", 8},         {"grad", 9}, {"indices", 10}};
223 const OptimPSSendIdx kSparseAdamPSSendIdx = {{"weight", INDEX_NOT_SEND},
224                                              {"m", INDEX_NOT_SEND},
225                                              {"v", INDEX_NOT_SEND},
226                                              {"beta1_power", 0},
227                                              {"beta2_power", 1},
228                                              {"lr", 2},
229                                              {"beta1", 3},
230                                              {"beta2", 4},
231                                              {"eps", 5},
232                                              {"grad", 6},
233                                              {"indices", 7}};
234 
235 const OptimOriginIdx kSparseFtrlOriginIdx = {{"weight", 0}, {"accum", 1}, {"linear", 2}, {"grad", 3}, {"indices", 4}};
236 const OptimPSSendIdx kSparseFtrlPSSendIdx = {
237   {"weight", INDEX_NOT_SEND}, {"accum", INDEX_NOT_SEND}, {"linear", INDEX_NOT_SEND}, {"grad", 0}, {"indices", 1}};
238 
239 const std::map<std::string, OptimOriginIdx> kOptimToOriginIdx = {{kApplyMomentum, kMomentumOriginIdx},
240                                                                  {kSparseAdam, kSparseAdamOriginIdx},
241                                                                  {kSparseLazyAdam, kSparseAdamOriginIdx},
242                                                                  {kSparseFtrl, kSparseFtrlOriginIdx}};
243 const std::map<std::string, OptimOriginIdx> kOptimToPSSendIdx = {{kApplyMomentum, kMomentumPSSendIdx},
244                                                                  {kSparseAdam, kSparseAdamPSSendIdx},
245                                                                  {kSparseLazyAdam, kSparseAdamPSSendIdx},
246                                                                  {kSparseFtrl, kSparseFtrlPSSendIdx}};
247 
// Barrier callbacks to be called before a scale-out / scale-in operation.
// Scaling nodes after an iteration has completed makes it easy to keep the cluster consistent.
using BarrierBeforeScaleOut = std::function<void()>;
using BarrierBeforeScaleIn = std::function<void()>;

// Handlers that help a worker/server node reinitialize or recover its data once the scheduler has
// finished a scale-out / scale-in operation.
using HandlerAfterScaleOut = std::function<void()>;
using HandlerAfterScaleIn = std::function<void()>;

// Status messages.
constexpr char kClusterSafeMode[]{"The cluster is in safemode."};
constexpr char kJobNotAvailable[]{"The server's training job is disabled or finished."};

// Iteration state events: kIterationRunning is 0 and kIterationCompleted is 1.
enum class CustomEvent {
  kIterationRunning = 0,
  kIterationCompleted
};
262 
// Checks that `idx` is a valid index into `vec` (idx < vec.size()); otherwise reports the vector's
// spelling, its size and the index through MS_LOG(EXCEPTION). MS_LOG is defined elsewhere; the
// EXCEPTION level presumably throws - confirm against the logging header.
//
// Both arguments are parenthesized so expressions such as `*ptr` or `a + b` expand correctly, and the
// local size variable uses a macro-specific name so a caller variable called `vec_size` is not captured.
// `idx` is evaluated once for the check and once more only when the check fails.
// The macro stays a plain brace block, as before, so existing call sites keep compiling unchanged.
#define EXC_IF_VEC_IDX_OOB(vec, idx)                                                                       \
  {                                                                                                        \
    const size_t exc_if_oob_size_ = (vec).size();                                                          \
    if ((idx) >= exc_if_oob_size_) {                                                                       \
      MS_LOG(EXCEPTION) << "Vector " << #vec << " size is " << exc_if_oob_size_ << ". So index " << (idx) \
                        << " is out of bound.";                                                            \
    }                                                                                                      \
  }
271 
// Logs `message` at ERROR level, then assigns RequestProcessResult(code, message) to `result`.
//
// The two statements are wrapped in do { ... } while (false) so the macro acts as a single statement:
// used under an unbraced `if`, the assignment is now guarded together with the log line instead of
// running unconditionally. Call sites still supply the trailing semicolon, as before.
// `message` is evaluated twice (once for the log, once for the result), so it should have no side effects.
#define ERROR_STATUS(result, code, message)              \
  do {                                                   \
    MS_LOG(ERROR) << (message);                          \
    (result) = RequestProcessResult((code), (message)); \
  } while (false)
275 
// Logs "Parse protobuf message failed." at ERROR level when `cond` evaluates to false.
// Despite its name, the macro neither returns from the enclosing function nor yields a value; it only
// logs. It expands to a single do/while(false) statement, so the call site supplies the semicolon.
#define CHECK_RETURN_TYPE(cond)                        \
  do {                                                 \
    if (cond) {                                        \
      break;                                           \
    }                                                  \
    MS_LOG(ERROR) << "Parse protobuf message failed."; \
  } while (false)
282 }  // namespace ps
283 }  // namespace mindspore
284 #endif  // MINDSPORE_CCSRC_PS_CONSTANTS_H_
285