/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// This schema uses the proto3 language version.
syntax = "proto3";
// Every type in this file is declared inside the mindspore.ps.core package.
package mindspore.ps.core;
// Generated code targets the protobuf lite runtime, so reflection-dependent
// features are unavailable to users of these types.
option optimize_for = LITE_RUNTIME;

// Command identifiers. MessageMeta.cmd carries one of these values.
enum NodeCommand {
  // Numbered 0, so this is the value read back when no command was set.
  TERMINATE = 0;
  REGISTER = 1;
  HEARTBEAT = 2;
  SEND_DATA = 3;
  // Sent by a worker or server to ask the scheduler for metadata.
  FETCH_METADATA = 4;
  FINISH = 5;
  COLLECTIVE_SEND_DATA = 6;
  // Sent by the scheduler to push metadata to the workers and servers.
  SEND_METADATA = 7;
  // Starts a scale out.
  SCALE_OUT = 8;
  // Starts a scale in.
  SCALE_IN = 9;
  // Synchronizes the scale out status of the cluster.
  SCALE_OUT_DONE = 10;
  // Synchronizes the scale in status of the cluster.
  SCALE_IN_DONE = 11;
  // Sends a user-defined event.
  SEND_EVENT = 12;
}

// The kinds of node that take part in the cluster.
enum NodeRole {
  // Numbered 0, so this is the role read back when the field was never set.
  SERVER = 0;
  WORKER = 1;
  SCHEDULER = 2;
}

// Metadata attached to a message; CommMessage embeds it as pb_meta.
message MessageMeta {
  // Command this message represents, e.g. register, heartbeat or data.
  NodeCommand cmd = 1;
  // Request id of this message.
  uint64 request_id = 2;
  // Role of the current node: worker, server or scheduler.
  NodeRole role = 3;
  // Rank id of the current node. Worker ranks lie in [0, numOfWorker-1] and
  // server ranks lie in [0, numOfServer-1].
  uint32 rank_id = 4;
  // User-defined command.
  int32 user_cmd = 5;
}

// Registration details describing a node: address, id and role.
message RegisterMessage {
  // IP of the node.
  string ip = 1;
  // Port of this node.
  uint32 port = 2;
  // Unique id of the current node (0, 1, 2, ...), carried as a string.
  string node_id = 3;
  // Role of the node: worker, server or scheduler.
  NodeRole role = 4;
}

// Carries a single node id.
// NOTE(review): by its name this is the reply to RegisterMessage - confirm
// against the code that sends it.
message RegisterRespMessage {
  string node_id = 1;
}

// Heartbeat payload; its only content is the node's id.
message HeartbeatMessage {
  // Unique id of the current node (0, 1, 2, ...).
  string node_id = 1;
}

// States of an individual node.
enum NodeState {
  // Numbered 0, so this is the state read back when the field was never set.
  NODE_STARTING = 0;
  NODE_FINISH = 1;
  NODE_READY = 2;
}

// Cluster-wide states. HeartbeatRespMessage and SendMetadataMessage each
// carry a field of this type.
enum ClusterState {
  // Numbered 0, so this is the state read back when the field was never set.
  // NOTE(review): the lowercase 'l' looks like a typo for CLUSTER_STARTING.
  // The spelling is kept as-is because renaming the value changes the
  // identifier emitted into generated code.
  ClUSTER_STARTING = 0;
  CLUSTER_READY = 1;
  CLUSTER_EXIT = 2;
  // NOTE(review): the only value here without the CLUSTER_ prefix.
  NODE_TIMEOUT = 3;
  CLUSTER_SCALE_OUT = 4;
  CLUSTER_SCALE_IN = 5;
  CLUSTER_NEW_INSTANCE = 6;
  CLUSTER_ENABLE_FLS = 7;
  CLUSTER_DISABLE_FLS = 8;
}

// NOTE(review): by its name this is the reply to HeartbeatMessage - confirm
// against the code that sends it.
message HeartbeatRespMessage {
  // State of the cluster.
  ClusterState cluster_state = 1;
  // Zero or more node metadata entries.
  repeated ServersMeta servers_meta = 2;
  // NOTE(review): presumably true when the node is a worker or is server 0 -
  // confirm with the sender.
  bool is_worker_or_server0 = 3;
}

// Request that carries only a node id.
// NOTE(review): presumably answered with FetchServersRespMessage - confirm.
message FetchServersMessage {
  string node_id = 1;
}

// Carries a list of ServersMeta entries.
message FetchServersRespMessage {
  // Zero or more node metadata entries.
  repeated ServersMeta servers_meta = 1;
}

// Describes one node: rank, address, liveness flag, role and id.
message ServersMeta {
  uint32 rank_id = 1;
  string ip = 2;
  // NOTE(review): declared int32 here while RegisterMessage.port is uint32.
  int32 port = 3;
  bool is_alive = 4;
  NodeRole role = 5;
  string node_id = 6;
}

// Metadata bundle: node entries, worker/server counts, the cluster state and
// the receiver's rank id.
message SendMetadataMessage {
  // Zero or more node metadata entries.
  repeated ServersMeta servers_meta = 1;
  // Current number of workers.
  int32 worker_num = 2;
  // Current number of servers.
  int32 server_num = 3;
  // Current state of the cluster.
  ClusterState cluster_state = 4;
  // Rank id of the node that receives this message.
  uint32 rank_id = 5;
}

// Finish payload; its only content is the node's id.
message FinishMessage {
  // Unique id of the current node (0, 1, 2, ...).
  string node_id = 1;
}

// Envelope that pairs message metadata with a payload.
message CommMessage {
  // Metadata describing the message.
  MessageMeta pb_meta = 1;
  // Payload as raw bytes; this schema does not define its layout.
  bytes data = 2;
}

// Broadcast by the scheduler to all nodes with the worker/server numbers
// that apply after scale out.
message ScaleOutMessage {
  // Number of workers after scale out.
  int32 worker_num = 1;
  // Number of servers after scale out.
  int32 server_num = 2;
}

// Broadcast by the scheduler to all nodes with the worker/server numbers
// that apply after scale in.
message ScaleInMessage {
  // Number of workers after scale in.
  int32 worker_num = 1;
  // Number of servers after scale in.
  int32 server_num = 2;
  // Whether the current node is a scale in node.
  bool is_node_scale_in = 3;
}

// Sent to the scheduler to report that scale out has completed.
message ScaleOutDoneMessage {
  string node_id = 1;
}

// Sent to the scheduler to report that scale in has completed.
message ScaleInDoneMessage {
  string node_id = 1;
}

// Sent by a worker/server to the scheduler, which then broadcasts the event
// to all other nodes.
message EventMessage {
  uint32 event = 1;
  string node_id = 2;
}

// Used by the scheduler to broadcast an event to all other nodes.
message EventRespMessage {
  uint32 event = 1;
}

// Carries a single boolean flag.
// NOTE(review): the field name suggests it reports whether all nodes have
// registered - confirm who sends this message and when.
message ScaleInFinishMessage {
  bool is_all_nodes_registered = 1;
}
