/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_DISTRIBUTED_CONSTANTS_H_
#define MINDSPORE_CCSRC_DISTRIBUTED_CONSTANTS_H_

#include <set>
#include <map>
#include <chrono>
#include <string>
#include <vector>
#include <utility>
#include <functional>

#include "actor/log.h"
#include "actor/msg.h"
#include "utils/ms_utils.h"
#include "utils/log_adapter.h"

namespace mindspore {
namespace distributed {
// The detailed reason for a failure to run 'mindspore.communication.init()' with ClusterContext.
constexpr char kDetailedFailureReason[] =
  "Maybe you are trying to call 'mindspore.communication.init()' without using 'mpirun', which requires MindSpore "
  "to load several environment variables and check their validity. Please use 'mpirun' to launch this process to fix "
  "this issue, or refer to this link if you want to run distributed training without using 'mpirun': "
  "https://www.mindspore.cn/tutorials/experts/zh-CN/master/parallel/dynamic_cluster.html";

constexpr char kWorkerProcessNotEnoughError[] = "The number of spawned worker processes is not as expected.";
constexpr char kSchedPortOccupiedError[] = "The configured scheduler port MS_SCHED_PORT is occupied by another process.";
constexpr char kSchedWorkerAddrNotConsistentError[] =
  "The MS_SCHED_HOST or MS_SCHED_PORT configured for the scheduler and the workers are not consistent with each other.";

constexpr char kEnvServerNum[] = "MS_SERVER_NUM";
constexpr char kEnvWorkerNum[] = "MS_WORKER_NUM";
constexpr char kEnvSchedulerHost[] = "MS_SCHED_HOST";
constexpr char kEnvSchedulerPort[] = "MS_SCHED_PORT";

constexpr char kEnvRole[] = "MS_ROLE";
constexpr char kEnvRoleOfServer[] = "MS_SERVER";
constexpr char kEnvRoleOfPServer[] = "MS_PSERVER";
constexpr char kEnvRoleOfWorker[] = "MS_WORKER";
constexpr char kEnvRoleOfScheduler[] = "MS_SCHED";
const std::set<std::string> kValidRoleName = {kEnvRoleOfServer, kEnvRoleOfPServer, kEnvRoleOfWorker,
                                              kEnvRoleOfScheduler};
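// Illustrative sketch: checking a configured MS_ROLE value against kValidRoleName. The helper
// name IsValidRoleName is hypothetical, not part of MindSpore's API.
inline bool IsValidRoleName(const std::string &role) { return kValidRoleName.count(role) > 0; }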

// Denotes which IP address is used for cluster building.
constexpr char kEnvWorkerIp[] = "MS_WORKER_IP";

// Used in parameter server embedding cache scenarios to identify the same Parameter between Worker and Server.
constexpr char kParameterKey[] = "parameter_key";
// Embedding cache lookup operation.
constexpr char kLookupEmbeddingCache[] = "LookupEmbeddingCache";
// Embedding cache update operation.
constexpr char kUpdateEmbeddingCache[] = "UpdateEmbeddingCache";
const std::vector<std::string> kEmbeddingCacheOps = {kLookupEmbeddingCache, kUpdateEmbeddingCache};
// Message header of the finalize mux recv actor.
constexpr char kFinalizeMuxRecvActor[] = "FINALIZE_MUX_RECV_ACTOR";

// The distributed execution mode enum.
// For each execution mode, different graph optimizations, splitting strategies, device placements, etc. are applied.
// For details please refer to class DistributedExecutionMode and its subclasses.

// kGeneralMode: Simply splits a training graph into multiple devices without other extra features.

// kParallelMode: Combines MindSpore's existing auto-parallel feature with the distributed graph splitting feature.
// This is much more complicated than the other modes. It is always applied in MoE scenarios.

// kPSMode: Applied when running Parameter Server training.

// kEmbeddingCacheMode: Applied when the embedding cache is enabled. Normally used for training models with a large
// embedding layer.
enum class DistExecutionMode { kGeneralMode = 0, kParallelMode, kPSMode, kEmbeddingCacheMode, kInvalidMode };

// The operator's label in distributed execution.
constexpr char kOpLabelRankId[] = "rank_id";
constexpr char kOpLabelRole[] = "ms_role";

constexpr char kLocalHost[] = "127.0.0.1";
constexpr int MAX_HOSTNAME_LEN = 1024;
const uint16_t kDefaultSchedPort = 6667;
const uint16_t kMaxPort = 65535;
constexpr uint32_t kDefaultFinishTimeout = 30;

// For each computing graph node, there is a range of port numbers for its rpc servers.
// Each node's range contains kNodePortRangeNum (4096) ports, starting from port 8118.
constexpr uint32_t kStartPort = 8118;
constexpr uint32_t kNodePortRangeNum = 4096;
constexpr char kNodePortRange[] = "node_port_range";
using ServerPortRange = std::pair<uint32_t, uint32_t>;
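// Illustrative sketch: one way a node's port range could be derived from kStartPort and
// kNodePortRangeNum. The helper name GetNodePortRange and the exact layout are assumptions,
// not the actual assignment logic used by the cluster.
inline ServerPortRange GetNodePortRange(uint32_t node_id) {
  const uint32_t begin = kStartPort + node_id * kNodePortRangeNum;
  return std::make_pair(begin, begin + kNodePortRangeNum - 1);
}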

constexpr char kDataSyncSrcOpName[] = "DataSyncSrc";
constexpr char kDataSyncDstOpName[] = "DataSyncDst";
constexpr char kControlSrcOpName[] = "ControlSrc";
constexpr char kControlDstOpName[] = "ControlDst";

static const char URL_PROTOCOL_IP_SEPARATOR[] = "://";
static const char URL_IP_PORT_SEPARATOR[] = ":";
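// Illustrative sketch: assembling a server URL such as "tcp://127.0.0.1:8118" from the
// separators above. The helper name BuildServerUrl is hypothetical.
inline std::string BuildServerUrl(const std::string &protocol, const std::string &ip, uint16_t port) {
  return protocol + URL_PROTOCOL_IP_SEPARATOR + ip + URL_IP_PORT_SEPARATOR + std::to_string(port);
}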

constexpr char kEnableRDMA[] = "enable_rdma";
constexpr char kRDMADevName[] = "rdma_dev";
constexpr char kRDMAIP[] = "rdma_ip";

constexpr char kDefaultIP[] = "1.1.8.203";
constexpr char kDefaultIfName[] = "hrn0_2";
constexpr uint16_t kDefaultPort = 10969;

// The interval at which rpc clients retry connecting.
constexpr uint32_t kRetryConnectInterval = 2;

// The maximum number of times to retry with an increasing port number.
constexpr uint32_t kMaxRetryPortNum = 10;

// The remote function id, which is increased progressively.
inline uint32_t kRemoteFuncId = 0;

// Rank list vector, which could be [m, n] or [m, m+1, ..., m+n].
using RankList = std::vector<uint32_t>;
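// Illustrative sketch: building a contiguous RankList [m, m+1, ..., m+n]. The helper name
// MakeContiguousRankList is hypothetical.
inline RankList MakeContiguousRankList(uint32_t m, uint32_t n) {
  RankList ranks;
  for (uint32_t rank = m; rank <= m + n; ++rank) {
    ranks.push_back(rank);
  }
  return ranks;
}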

// This macro yields the current timestamp in milliseconds.
#define CURRENT_TIMESTAMP_MILLI \
  (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()))
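// Illustrative sketch: computing elapsed milliseconds with the macro above. The helper name
// ElapsedMilliseconds is hypothetical.
inline int64_t ElapsedMilliseconds(const std::chrono::milliseconds &start) {
  return (CURRENT_TIMESTAMP_MILLI - start).count();
}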

using MessageHandler = std::function<MessageBase *const(MessageBase *const)>;
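// Illustrative sketch: a handler matching the MessageHandler signature that simply echoes the
// message back. Real handlers are registered by the rpc actors elsewhere; kEchoHandler is hypothetical.
inline const MessageHandler kEchoHandler = [](MessageBase *const msg) -> MessageBase * { return msg; };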

/**
 * @description: The callback function type for allocating memory after receiving data from the peer.
 * @param {size_t} size: Size of the memory to be allocated.
 * @return {void *}: A pointer to the newly allocated memory.
 */
using MemAllocateCallback = std::function<void *(size_t size)>;

/**
 * @description: The callback function type for releasing memory after sending it to the peer.
 * @param {void *} data: The memory to be released, which should be allocated on the heap.
 * @return {bool}: Whether the memory is successfully released.
 */
using MemFreeCallback = std::function<bool(void *data)>;
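// Illustrative sketch: a matching pair of callbacks backed by global operator new/delete. Real
// implementations may allocate from a device or pooled allocator; these default names are hypothetical.
inline const MemAllocateCallback kDefaultMemAllocate = [](size_t size) -> void * { return ::operator new(size); };
inline const MemFreeCallback kDefaultMemFree = [](void *data) -> bool {
  ::operator delete(data);
  return true;
};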
}  // namespace distributed
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_DISTRIBUTED_CONSTANTS_H_