#! /usr/bin/python

# Copyright 2017 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
Swarming bot manager running on servers that hold swarming bots.
This manages running swarming bots and routinely recovers any that die.
"""

import argparse
import logging
import signal
import socket
import sys
import time
import urllib2

import common
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.site_utils.chromeos_proxy import swarming_bots

from chromite.lib import metrics
from chromite.lib import ts_mon_config


# Seconds between consecutive bot checks.
CHECK_INTERVAL = 180

# Set to True by handle_signal() to ask the main loop to exit.
_shut_down = False

metrics_template = 'chromeos/autotest/swarming/bot_manager/%s'


def _parse_args(args):
    """Parse system arguments.

    @param args: A list of command line arguments (without the program name).

    @return: The argparse namespace holding the parsed options.
    """
    parser = argparse.ArgumentParser(
            description='Manage the set of swarming bots running on a server')
    parser.add_argument('afe', type=str,
                        help='AFE to get server role and status.')
    # TODO(xixuan): refactor together with swarming_bots.
    parser.add_argument(
            'id_range', type=str,
            help='A range of integer, each bot created will be labeled '
                 'with an id from this range. E.g. "1-200"')
    parser.add_argument(
            'working_dir', type=str,
            help='A working directory where bots will store files '
                 'generated at runtime')
    parser.add_argument(
            '-p', '--swarming_proxy', type=str, dest='swarming_proxy',
            default=swarming_bots.DEFAULT_SWARMING_PROXY,
            help='The URL of the swarming instance to talk to, '
                 'Default to the one specified in global config')
    parser.add_argument(
            '-f', '--log_file', dest='log_file',
            help='Path to the log file.')
    parser.add_argument(
            '-v', '--verbose', dest='verbose', action='store_true',
            help='Verbose mode')

    return parser.parse_args(args)


def handle_signal(signum, frame):
    """Function called when being killed.

    Flags the main loop to stop after the current iteration.

    @param signum: The signal received.
    @param frame: Ignored.
    """
    # Without this declaration the assignment below would only create a
    # local variable and the main loop would never observe the request.
    global _shut_down

    del signum
    del frame

    _shut_down = True


def is_server_in_prod(server_name, afe):
    """Validate server's role and status.

    @param server_name: the server name to be validated.
    @param afe: the afe server to get role & status info in server_db.

    @return: A boolean value, True when the server_name is in prod, False
             otherwise, or if RPC fails.
    """
    logging.info('Validating server: %s', server_name)
    # Keep the `afe` argument intact so log messages show the AFE name
    # rather than the client object.
    afe_client = frontend_wrappers.RetryingAFE(timeout_min=5, delay_sec=10,
                                               server=afe)
    is_prod_proxy_server = False
    try:
        if afe_client.run('get_servers', hostname=server_name,
                          status='primary', role='golo_proxy'):
            is_prod_proxy_server = True
    except urllib2.URLError as e:
        logging.warning('RPC get_servers failed on afe %s: %s', afe, str(e))
    except Exception:
        # Best effort: a failed check is reported as "not in prod" so the
        # caller's loop keeps running and retries on the next tick.
        logging.exception('Unexpected error validating server %s on afe %s',
                          server_name, afe)
    finally:
        # Record the outcome of every check, successful or not.
        metrics.Counter(metrics_template % 'server_in_prod_check').increment(
                fields={'success': is_prod_proxy_server})

    return is_prod_proxy_server


@metrics.SecondsTimerDecorator(metrics_template % 'tick')
def tick(afe, bot_manager):
    """One tick for swarming bot manager.

    Bots are only checked when this host is validated as a prod server.

    @param afe: the afe to check server role.
    @param bot_manager: a swarming_bots.BotManager instance.
    """
    if is_server_in_prod(socket.getfqdn(), afe):
        bot_manager.check()


def main(args):
    """Main func.

    @param args: A list of system arguments.

    @return: 1 if no swarming proxy is configured, 0 after a clean shutdown.
    """
    args = _parse_args(args)
    swarming_bots.setup_logging(args.verbose, args.log_file)

    if not args.swarming_proxy:
        logging.error(
                'No swarming proxy instance specified. '
                'Specify swarming_proxy in [CROS] in shadow_config, '
                'or use --swarming_proxy')
        return 1

    if not args.swarming_proxy.startswith('https://'):
        swarming_proxy = 'https://' + args.swarming_proxy
    else:
        swarming_proxy = args.swarming_proxy

    logging.info("Setting signal handler.")
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    # Hand the normalized URL (with scheme) to the bot manager.
    bot_manager = swarming_bots.BotManager(
            swarming_bots.parse_range(args.id_range),
            args.working_dir,
            swarming_proxy)
    with ts_mon_config.SetupTsMonGlobalState('swarming_bots', indirect=True):
        while not _shut_down:
            tick(args.afe, bot_manager)
            # Skip the wait if a shutdown was requested during the tick.
            if _shut_down:
                break
            time.sleep(CHECK_INTERVAL)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))