# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

# This script runs the server's disaster recovery process: it kills the server process and then launches it again.

import argparse
import ast
import os
import shlex
import subprocess

# Command-line options accepted by this script, as (flag, converter, default) rows.
# Boolean options use ast.literal_eval so the text "True"/"False" is turned into a real bool.
# Apart from --node_id (exported as MS_NODE_ID) and --disaster_recovery_server_port (used to find
# the process to kill and passed on as --fl_server_port), every value is forwarded unchanged to
# the relaunched test_mobile_lenet.py.
_OPTION_SPECS = (
    ("--device_target", str, "CPU"),
    ("--server_mode", str, "HYBRID_TRAINING"),
    ("--worker_num", int, 1),
    ("--server_num", int, 2),
    ("--scheduler_ip", str, "127.0.0.1"),
    ("--scheduler_port", int, 8113),
    # The fl server port of the server which needs to be killed.
    ("--disaster_recovery_server_port", int, 10976),
    ("--node_id", str, ""),
    ("--start_fl_job_threshold", int, 1),
    ("--start_fl_job_time_window", int, 3000),
    ("--update_model_ratio", float, 1.0),
    ("--update_model_time_window", int, 3000),
    ("--fl_name", str, "Lenet"),
    ("--fl_iteration_num", int, 25),
    ("--client_epoch_num", int, 20),
    ("--client_batch_size", int, 32),
    ("--client_learning_rate", float, 0.1),
    ("--root_first_ca_path", str, ""),
    ("--root_second_ca_path", str, ""),
    ("--pki_verify", ast.literal_eval, False),
    ("--root_first_crl_path", str, ""),
    ("--root_second_crl_path", str, ""),
    ("--sts_jar_path", str, ""),
    ("--sts_properties_path", str, ""),
    ("--dp_eps", float, 50.0),
    ("--dp_delta", float, 0.01),  # usually equals 1/start_fl_job_threshold
    ("--dp_norm_clip", float, 1.0),
    ("--encrypt_type", str, "NOT_ENCRYPT"),
    ("--client_password", str, ""),
    ("--server_password", str, ""),
    ("--enable_ssl", ast.literal_eval, False),
)

parser = argparse.ArgumentParser(description="Run test_mobile_lenet.py case")
# Register the options in table order so the generated --help output keeps the same ordering.
for _flag, _converter, _default in _OPTION_SPECS:
    parser.add_argument(_flag, type=_converter, default=_default)


# Parse the command line; options this script does not declare are tolerated and discarded.
args = parser.parse_known_args()[0]

# Copy every parsed option into a module-level name of the same spelling, grouped by topic.
# Target device, server mode and cluster size.
device_target, server_mode = args.device_target, args.server_mode
worker_num, server_num = args.worker_num, args.server_num
# Scheduler address.
scheduler_ip, scheduler_port = args.scheduler_ip, args.scheduler_port
# The server that is killed and relaunched.
disaster_recovery_server_port, node_id = args.disaster_recovery_server_port, args.node_id
# FL job settings.
start_fl_job_threshold, start_fl_job_time_window = args.start_fl_job_threshold, args.start_fl_job_time_window
update_model_ratio, update_model_time_window = args.update_model_ratio, args.update_model_time_window
fl_name, fl_iteration_num = args.fl_name, args.fl_iteration_num
# Client-side training settings.
client_epoch_num, client_batch_size = args.client_epoch_num, args.client_batch_size
client_learning_rate = args.client_learning_rate
# Certificate, CRL and STS related paths plus the PKI switch.
root_first_ca_path, root_second_ca_path = args.root_first_ca_path, args.root_second_ca_path
root_first_crl_path, root_second_crl_path = args.root_first_crl_path, args.root_second_crl_path
sts_jar_path, sts_properties_path = args.sts_jar_path, args.sts_properties_path
pki_verify = args.pki_verify
# dp_* values and the encryption type.
dp_eps, dp_delta, dp_norm_clip = args.dp_eps, args.dp_delta, args.dp_norm_clip
encrypt_type = args.encrypt_type
# Passwords and the SSL switch.
client_password, server_password = args.client_password, args.server_password
enable_ssl = args.enable_ssl

#Step 1: make the server offline.
# First half: collect the PIDs (column 2 of `ps -ef`) of every process whose listing mentions the
# fl server port, skipping lines that contain "cd", the grep helpers themselves, and
# "run_server_disaster_recovery" (presumably this script's own name).
find_pids_cmd = "".join([
    "ps_demo_id=`ps -ef | grep ",
    str(disaster_recovery_server_port),
    "|grep -v cd  | grep -v grep | grep -v run_server_disaster_recovery | awk '{print $2}'`",
])
# Second half: send SIGKILL to each collected PID and report it.
kill_pids_cmd = " && for id in $ps_demo_id; do kill -9 $id && echo \"Killed server process: $id\"; done"
offline_cmd = find_pids_cmd + kill_pids_cmd
subprocess.call(["bash", "-c", offline_cmd])

#Step 2: Wait 3 seconds for recovery.
# The notice and the pause run as a single bash command line, so the script blocks until it ends.
wait_cmd = " && ".join(['echo "Start to sleep for 3 seconds"', "sleep 3"])
subprocess.call(["bash", "-c", wait_cmd])

#Step 3: Launch the server again with the same fl server port.
# Export the node id so the process launched below inherits it as MS_NODE_ID.
os.environ['MS_NODE_ID'] = str(node_id)

# Working directory of the relaunched server; ${execute_path} is expanded by bash, not by Python.
server_dir = "${execute_path}/disaster_recovery_server_" + str(disaster_recovery_server_port) + "/"

# Options forwarded to test_mobile_lenet.py, as (name, value) pairs. Each option appears exactly
# once: the command previously repeated --root_second_crl_path and --enable_ssl.
server_options = [
    ("device_target", device_target),
    ("server_mode", server_mode),
    ("ms_role", "MS_SERVER"),
    ("worker_num", worker_num),
    ("server_num", server_num),
    ("scheduler_ip", scheduler_ip),
    ("scheduler_port", scheduler_port),
    ("fl_server_port", disaster_recovery_server_port),
    ("start_fl_job_threshold", start_fl_job_threshold),
    ("enable_ssl", enable_ssl),
    ("start_fl_job_time_window", start_fl_job_time_window),
    ("update_model_ratio", update_model_ratio),
    ("update_model_time_window", update_model_time_window),
    ("fl_name", fl_name),
    ("fl_iteration_num", fl_iteration_num),
    ("client_epoch_num", client_epoch_num),
    ("client_batch_size", client_batch_size),
    ("client_learning_rate", client_learning_rate),
    ("dp_eps", dp_eps),
    ("dp_delta", dp_delta),
    ("dp_norm_clip", dp_norm_clip),
    ("encrypt_type", encrypt_type),
    ("root_first_ca_path", root_first_ca_path),
    ("root_second_ca_path", root_second_ca_path),
    ("pki_verify", pki_verify),
    ("root_first_crl_path", root_first_crl_path),
    ("root_second_crl_path", root_second_crl_path),
    ("client_password", client_password),
    ("server_password", server_password),
    ("sts_jar_path", sts_jar_path),
    ("sts_properties_path", sts_properties_path),
]

# NOTE(review): this script never sets ${script_self}; unless it is inherited from the environment,
# `dirname ""` yields "." and, after the cd below, ${self_path}/.. resolves to the directory the
# script was started from. Confirm that this is the intended location of test_mobile_lenet.py.
cmd_server = "execute_path=$(pwd) && self_path=$(dirname \"${script_self}\") && "
# Start from an empty working directory for the relaunched server.
cmd_server += "rm -rf " + server_dir + " &&"
cmd_server += "mkdir " + server_dir + " &&"
cmd_server += "cd " + server_dir + " || exit && export GLOG_v=1 &&"
cmd_server += "python ${self_path}/../test_mobile_lenet.py"
# Quote every value: passwords and paths come straight from the command line and may contain
# spaces or shell metacharacters, which would otherwise be interpreted by bash. Values without
# special characters (numbers, True/False, plain words) are passed through unchanged, and an
# empty value becomes '' which bash hands over as an empty string.
cmd_server += "".join(
    " --" + option + "=" + shlex.quote(str(value)) for option, value in server_options
)
# Run the server in the background with stdout and stderr captured in server.log.
cmd_server += " > server.log 2>&1 &"

subprocess.call(['bash', '-c', cmd_server])
