/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

syntax = "proto3";

package mindspore.ps.core;

option optimize_for = LITE_RUNTIME;

// Command carried in MessageMeta.cmd.
//
// NOTE(review): TERMINATE is the zero value, so under proto3 a MessageMeta
// whose cmd was never set reads back as TERMINATE. Kept as-is because the
// numbers are part of the wire contract.
enum NodeCommand {
  TERMINATE = 0;
  REGISTER = 1;
  HEARTBEAT = 2;
  SEND_DATA = 3;
  // The worker or server asks the scheduler for metadata.
  FETCH_METADATA = 4;
  FINISH = 5;
  COLLECTIVE_SEND_DATA = 6;
  // The scheduler actively sends metadata to the worker and server.
  SEND_METADATA = 7;
  // Starts a scale out.
  SCALE_OUT = 8;
  // Starts a scale in.
  SCALE_IN = 9;
  // Synchronizes the scale out status of the cluster.
  SCALE_OUT_DONE = 10;
  // Synchronizes the scale in status of the cluster.
  SCALE_IN_DONE = 11;
  // Sends a user defined event.
  SEND_EVENT = 12;
}

// Role of a node in the cluster.
//
// NOTE(review): SERVER is the zero value, so an unset role field reads back
// as SERVER under proto3.
enum NodeRole {
  SERVER = 0;
  WORKER = 1;
  SCHEDULER = 2;
}

// Metadata that accompanies a message (see CommMessage.pb_meta).
message MessageMeta {
  // The command of this message, for example: register, heartbeat, data.
  NodeCommand cmd = 1;
  // The request id of this message.
  uint64 request_id = 2;
  // The role of the current node: worker, server or scheduler.
  NodeRole role = 3;
  // The rank id of the current node. The worker range is
  // [0, numOfWorker - 1]; the server range is [0, numOfServer - 1].
  uint32 rank_id = 4;
  // User-defined command.
  int32 user_cmd = 5;
}

// Registration data describing a node.
message RegisterMessage {
  // The ip of the node.
  string ip = 1;
  // The port of this node.
  uint32 port = 2;
  // The unique id of the current node.
  // NOTE(review): the previous comment listed "0,1,2..." as examples although
  // the field is a string; confirm the actual id format.
  string node_id = 3;
  // The role of the node: worker, server or scheduler.
  NodeRole role = 4;
}

// NOTE(review): presumably the reply to RegisterMessage; confirm with the
// code that handles REGISTER.
message RegisterRespMessage {
  string node_id = 1;
}

// Heartbeat payload identifying the sending node.
message HeartbeatMessage {
  // The unique id of the current node.
  // NOTE(review): the previous comment listed "0,1,2..." as examples although
  // the field is a string; confirm the actual id format.
  string node_id = 1;
}

// State of a single node.
enum NodeState {
  NODE_STARTING = 0;
  NODE_FINISH = 1;
  NODE_READY = 2;
}

// State of the whole cluster.
enum ClusterState {
  // NOTE(review): the name is spelled with a lowercase 'l' ("ClUSTER"). It is
  // kept unchanged because renaming it changes the generated identifier that
  // existing callers reference.
  ClUSTER_STARTING = 0;
  CLUSTER_READY = 1;
  CLUSTER_EXIT = 2;
  // NOTE(review): the only value here without the CLUSTER_ prefix.
  NODE_TIMEOUT = 3;
  CLUSTER_SCALE_OUT = 4;
  CLUSTER_SCALE_IN = 5;
  CLUSTER_NEW_INSTANCE = 6;
  CLUSTER_ENABLE_FLS = 7;
  CLUSTER_DISABLE_FLS = 8;
}

// NOTE(review): presumably the reply to HeartbeatMessage; confirm with the
// code that handles HEARTBEAT.
message HeartbeatRespMessage {
  ClusterState cluster_state = 1;
  repeated ServersMeta servers_meta = 2;
  // NOTE(review): meaning is not documented in this file; confirm with the
  // code that sets it.
  bool is_worker_or_server0 = 3;
}

// NOTE(review): presumably the payload of FETCH_METADATA; confirm.
message FetchServersMessage {
  string node_id = 1;
}

// NOTE(review): presumably the reply to FetchServersMessage; confirm.
message FetchServersRespMessage {
  repeated ServersMeta servers_meta = 1;
}

// Description of one node as listed in metadata messages.
message ServersMeta {
  uint32 rank_id = 1;
  string ip = 2;
  // NOTE(review): declared int32 here but uint32 in RegisterMessage.port.
  // Left unchanged because switching the type alters the generated accessor
  // type seen by callers.
  int32 port = 3;
  bool is_alive = 4;
  NodeRole role = 5;
  string node_id = 6;
}

// Metadata message; the SEND_METADATA command is documented as the scheduler
// actively sending metadata to the worker and server.
message SendMetadataMessage {
  repeated ServersMeta servers_meta = 1;
  // The current worker number.
  int32 worker_num = 2;
  // The current server number.
  int32 server_num = 3;
  // The current cluster state.
  ClusterState cluster_state = 4;
  // The rank id of the node that received this message.
  uint32 rank_id = 5;
}

// NOTE(review): presumably the payload of FINISH; confirm.
message FinishMessage {
  // The unique id of the current node.
  // NOTE(review): the previous comment listed "0,1,2..." as examples although
  // the field is a string; confirm the actual id format.
  string node_id = 1;
}

// Envelope pairing message metadata with an opaque payload.
message CommMessage {
  MessageMeta pb_meta = 1;
  bytes data = 2;
}

// The scheduler will broadcast the worker/server numbers after scale out to
// all nodes.
message ScaleOutMessage {
  // The worker number after scale out.
  int32 worker_num = 1;
  // The server number after scale out.
  int32 server_num = 2;
}

// The scheduler will broadcast the worker/server numbers after scale in to
// all nodes.
message ScaleInMessage {
  // The worker number after scale in.
  int32 worker_num = 1;
  // The server number after scale in.
  int32 server_num = 2;
  // Whether the current node is a scale in node.
  bool is_node_scale_in = 3;
}

// This message is sent to the scheduler to notify the completion of scale
// out.
message ScaleOutDoneMessage {
  string node_id = 1;
}

// This message is sent to the scheduler to notify the completion of scale
// in.
message ScaleInDoneMessage {
  string node_id = 1;
}

// This message is sent by the worker/server to the scheduler; the scheduler
// then broadcasts the event to all other nodes.
message EventMessage {
  uint32 event = 1;
  string node_id = 2;
}

// The scheduler broadcasts the event to all other nodes through this
// message.
message EventRespMessage {
  uint32 event = 1;
}

// NOTE(review): purpose is not documented in this file; the single field
// reports whether all nodes are registered.
message ScaleInFinishMessage {
  bool is_all_nodes_registered = 1;
}