/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_DISTRIBUTED_CONSTANTS_H_
#define MINDSPORE_CCSRC_DISTRIBUTED_CONSTANTS_H_

#include <set>
#include <map>
#include <chrono>
#include <string>
#include <vector>
#include <utility>
#include <functional>

#include "actor/log.h"
#include "actor/msg.h"
#include "utils/ms_utils.h"
#include "utils/log_adapter.h"

namespace mindspore {
namespace distributed {
// The detailed reason for failing to run 'mindspore.communication.init()' with ClusterContext.
constexpr char kDetailedFailureReason[] =
  "Maybe you are trying to call 'mindspore.communication.init()' without using 'mpirun', which will make MindSpore "
  "load several environment variables and check their validity. Please use 'mpirun' to launch this process to fix "
  "this issue, or refer to this link if you want to run distributed training without using 'mpirun': "
  "https://www.mindspore.cn/tutorials/experts/zh-CN/master/parallel/dynamic_cluster.html";

constexpr char kWorkerProcessNotEnoughError[] = "The number of spawned worker processes is not as expected.";
constexpr char kSchedPortOccupiedError[] = "The configured scheduler port MS_SCHED_PORT is occupied by another process.";
constexpr char kSchedWorkerAddrNotConsistentError[] =
  "The MS_SCHED_HOST or MS_SCHED_PORT configured for the scheduler and the workers are not consistent with each other.";

constexpr char kEnvServerNum[] = "MS_SERVER_NUM";
constexpr char kEnvWorkerNum[] = "MS_WORKER_NUM";
constexpr char kEnvSchedulerHost[] = "MS_SCHED_HOST";
constexpr char kEnvSchedulerPort[] = "MS_SCHED_PORT";

constexpr char kEnvRole[] = "MS_ROLE";
constexpr char kEnvRoleOfServer[] = "MS_SERVER";
constexpr char kEnvRoleOfPServer[] = "MS_PSERVER";
constexpr char kEnvRoleOfWorker[] = "MS_WORKER";
constexpr char kEnvRoleOfScheduler[] = "MS_SCHED";
const std::set<std::string> kValidRoleName = {kEnvRoleOfServer, kEnvRoleOfPServer, kEnvRoleOfWorker,
                                              kEnvRoleOfScheduler};

// Denotes which IP address is used for cluster building.
constexpr char kEnvWorkerIp[] = "MS_WORKER_IP";

// Used in parameter server embedding cache scenarios to identify the same Parameter between the worker and the server.
constexpr char kParameterKey[] = "parameter_key";
// Embedding cache lookup operation.
constexpr char kLookupEmbeddingCache[] = "LookupEmbeddingCache";
// Embedding cache update operation.
constexpr char kUpdateEmbeddingCache[] = "UpdateEmbeddingCache";
const std::vector<std::string> kEmbeddingCacheOps = {kLookupEmbeddingCache, kUpdateEmbeddingCache};
// Message header for finalizing the mux recv actor.
constexpr char kFinalizeMuxRecvActor[] = "FINALIZE_MUX_RECV_ACTOR";

// The distributed execution mode enum.
// For each execution mode, different graph optimizations, splitting strategies, device locations, etc. are applied.
// For details please refer to class DistributedExecutionMode and its subclasses.

// kGeneralMode: Simply splits a training graph into multiple devices without other extra features.

// kParallelMode: Combines MindSpore's existing auto-parallel feature with the distributed graph splitting feature.
// This is much more complicated than the other modes. It is always applied in MoE scenarios.

// kPSMode: Applied when running Parameter Server training.

// kEmbeddingCacheMode: Applied when the embedding cache is enabled. Normally used for training models with a large
// embedding layer.
enum class DistExecutionMode { kGeneralMode = 0, kParallelMode, kPSMode, kEmbeddingCacheMode, kInvalidMode };

// The operator's label in distributed execution.
constexpr char kOpLabelRankId[] = "rank_id";
constexpr char kOpLabelRole[] = "ms_role";

constexpr char kLocalHost[] = "127.0.0.1";
constexpr int MAX_HOSTNAME_LEN = 1024;
const uint16_t kDefaultSchedPort = 6667;
const uint16_t kMaxPort = 65535;
constexpr uint32_t kDefaultFinishTimeout = 30;

// For each computing graph node, there is a range of port numbers for its rpc servers.
// Each node's range contains kNodePortRangeNum ports, starting from kStartPort (8118).
constexpr uint32_t kStartPort = 8118;
constexpr uint32_t kNodePortRangeNum = 4096;
constexpr char kNodePortRange[] = "node_port_range";
using ServerPortRange = std::pair<uint32_t, uint32_t>;
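// A minimal usage sketch (not part of the original header; the consecutive
// assignment policy below is an assumption for illustration, not necessarily
// MindSpore's actual logic): if port ranges were handed out consecutively by
// node index, node i would own ports
// [kStartPort + i * kNodePortRangeNum, kStartPort + (i + 1) * kNodePortRangeNum - 1].
// The function name GetNodePortRangeSketch is hypothetical.
inline ServerPortRange GetNodePortRangeSketch(uint32_t node_index) {
  // First port owned by this node; each node owns kNodePortRangeNum ports.
  uint32_t begin = kStartPort + node_index * kNodePortRangeNum;
  // The returned range is inclusive on both ends.
  return std::make_pair(begin, begin + kNodePortRangeNum - 1);
}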
constexpr char kDataSyncSrcOpName[] = "DataSyncSrc";
constexpr char kDataSyncDstOpName[] = "DataSyncDst";
constexpr char kControlSrcOpName[] = "ControlSrc";
constexpr char kControlDstOpName[] = "ControlDst";

static const char URL_PROTOCOL_IP_SEPARATOR[] = "://";
static const char URL_IP_PORT_SEPARATOR[] = ":";

constexpr char kEnableRDMA[] = "enable_rdma";
constexpr char kRDMADevName[] = "rdma_dev";
constexpr char kRDMAIP[] = "rdma_ip";

constexpr char kDefaultIP[] = "1.1.8.203";
constexpr char kDefaultIfName[] = "hrn0_2";
constexpr uint16_t kDefaultPort = 10969;

// The retry interval for rpc clients' connection attempts.
constexpr uint32_t kRetryConnectInterval = 2;

// The maximum number of times to retry with an increasing port number.
constexpr uint32_t kMaxRetryPortNum = 10;

// The remote function id, which will be increased progressively.
inline uint32_t kRemoteFuncId = 0;

// Rank list vector, which could be [m, n] or [m, m+1, ..., m+n].
using RankList = std::vector<uint32_t>;

// This macro expands to the current timestamp in milliseconds.
#define CURRENT_TIMESTAMP_MILLI \
  (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()))

using MessageHandler = std::function<MessageBase *const(MessageBase *const)>;

/**
 * @description: The callback function type for allocating memory after receiving data from the peer.
 * @param {size_t} size: Size of the memory to be allocated.
 * @return {void *}: A pointer to the newly allocated memory.
 */
using MemAllocateCallback = std::function<void *(size_t size)>;

/**
 * @description: The callback function type for releasing memory after sending it to the peer.
 * @param {void} *data: The memory to be released, which should be allocated on the heap.
 * @return {bool}: Whether the memory is successfully released.
 */
using MemFreeCallback = std::function<bool(void *data)>;
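// A minimal sketch (illustrative only, not part of the original header): the
// two callback types above could, for example, be backed by the raw global
// allocation operators. The kExample* names are hypothetical.
inline const MemAllocateCallback kExampleAllocateCb = [](size_t size) -> void * {
  // Allocate uninitialized heap memory for an incoming payload.
  return ::operator new(size);
};
inline const MemFreeCallback kExampleFreeCb = [](void *data) -> bool {
  if (data == nullptr) {
    return false;  // Nothing to release.
  }
  ::operator delete(data);
  return true;
};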
}  // namespace distributed
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_DISTRIBUTED_CONSTANTS_H_