autotest/site_utils/test_push.py

#!/usr/bin/python2
#
# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Tool to validate code in prod branch before pushing to lab.

The script runs the push_to_prod suite to verify that code in the prod branch
is ready to be pushed. Link to design document:
https://docs.google.com/a/google.com/document/d/1JMz0xS3fZRSHMpFkkKAL_rxsdbNZomhHbC3B8L71uuI/edit

To verify whether the prod branch can be pushed to the lab, run the following
command on the chromeos-staging-master2.hot server:
/usr/local/autotest/site_utils/test_push.py -e someone@company.com

The script uses the latest gandof stable build as the test build by default.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import ast
import datetime
import getpass
import multiprocessing
import os
import re
import subprocess
import sys
import time
import traceback
from six.moves import urllib

import common
try:
    from autotest_lib.frontend import setup_django_environment
    from autotest_lib.frontend.afe import models
    from autotest_lib.frontend.afe import rpc_utils
except ImportError:
    # Unittest may not have Django database configured and will fail to import.
    pass
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import priorities
from autotest_lib.client.common_lib.cros import retry
from autotest_lib.frontend.afe import rpc_client_lib
from autotest_lib.server import constants
from autotest_lib.server import site_utils
from autotest_lib.server import utils
from autotest_lib.server.cros import provision
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.site_utils import test_push_common

AUTOTEST_DIR = common.autotest_dir
CONFIG = global_config.global_config

# Module-wide RPC clients for the AFE and TKO frontends, built from the
# retrying wrappers in frontend_wrappers.
AFE = frontend_wrappers.RetryingAFE(timeout_min=0.5, delay_sec=2)
TKO = frontend_wrappers.RetryingTKO(timeout_min=0.1, delay_sec=10)

MAIL_FROM = 'chromeos-test@google.com'
# Regex patterns are raw strings so that `\d` and `\.` reach the regex engine
# verbatim instead of being parsed as (invalid) string escape sequences.
BUILD_REGEX = r'R[\d]+-[\d]+\.[\d]+\.[\d]+'
RUN_SUITE_COMMAND = 'run_suite.py'
PUSH_TO_PROD_SUITE = 'push_to_prod'
DUMMY_SUITE = 'dummy'
DEFAULT_TIMEOUT_MIN_FOR_SUITE_JOB = 30
IMAGE_BUCKET = CONFIG.get_config_value('CROS', 'image_storage_server')
# Default (board, required number of DUTs) pairs; parse_arguments turns this
# into the default value of --num_duts.
DEFAULT_NUM_DUTS = (
        ('gandof', 4),
        ('quawks', 2),
)

# Matches the run_suite output line that announces the suite job and captures
# the numeric job id.
SUITE_JOB_START_INFO_REGEX = (r'^.*Created suite job:.*'
                              r'tab_id=view_job&object_id=(\d+)$')

URL_HOST = CONFIG.get_config_value('SERVER', 'hostname', type=str)
URL_PATTERN = CONFIG.get_config_value('CROS', 'log_url_pattern', type=str)

# Some tests could be extra / missing or have mismatched results for various
# reasons. Add such tests to this list and explain the reason.
_IGNORED_TESTS = [
    # test_push uses a stable image build to test, which is quite behind ToT.
    # The following expectations are correct at ToT, but need to be ignored
    # until stable image is recent enough.

    # TODO(pprabhu): Remove once R70 is stable.
    'dummy_Fail.RetrySuccess',
    'dummy_Fail.RetryFail',
]

# Multiprocessing proxy objects that are used to share data between background
# suite-running processes and main process. The multiprocessing-compatible
# versions are initialized in _main.
_run_suite_output = []
_all_suite_ids = []

# Default for --service_respawn_limit: a service respawned more often than
# this during the push test is reported as crashed.
DEFAULT_SERVICE_RESPAWN_LIMIT = 2


class TestPushException(Exception):
    """Raised when a step of the push-to-prod validation fails."""

@retry.retry(TestPushException, timeout_min=5, delay_sec=30)
def check_dut_inventory(required_num_duts, pool):
    """Verify the pool holds enough Ready, unlocked DUTs per platform.

    The function is wrapped by retry.retry, configured for TestPushException
    with timeout_min=5 and delay_sec=30.

    @param required_num_duts: a dict mapping each platform to the number of
                              DUTs it needs in order to finish push tests.
    @param pool: the pool used by test_push.
    @raise TestPushException: if any platform has fewer Ready DUTs in the pool
                              than required.
    """
    print('Checking DUT inventory...')
    pool_label = constants.Labels.POOL_PREFIX + pool
    ready_hosts = AFE.run('get_hosts', status='Ready', locked=False)

    # Tally the hosts that carry the pool label, keyed by platform.
    current_inventory = {}
    for host in ready_hosts:
        if pool_label not in host.get('labels', []):
            continue
        platform = host['platform']
        current_inventory[platform] = current_inventory.get(platform, 0) + 1

    # Collect one message per platform that falls short of its requirement.
    shortages = []
    for platform, req_num in required_num_duts.items():
        curr_num = current_inventory.get(platform, 0)
        if curr_num >= req_num:
            continue
        shortages.append(
                '\nRequire %d %s DUTs in pool: %s, only %d are Ready'
                ' now' % (req_num, platform, pool, curr_num))

    if shortages:
        raise TestPushException('Not enough DUTs to run push tests. %s' %
                                ''.join(shortages))


def powerwash_dut_to_test_repair(hostname, timeout):
    """Run platform_Powerwash on a DUT, then reverify it to exercise repair.

    @param hostname: hostname of the dut.
    @param timeout: seconds to wait for the powerwash job to report test
                    statuses before aborting it.
    @raise TestPushException: if the job reports nothing within the timeout,
                              or if result verification fails.
    """
    powerwash_test = models.Test.objects.get(name='platform_Powerwash')
    control_path = os.path.join(AUTOTEST_DIR, powerwash_test.path)
    control_file = utils.read_file(control_path)
    job_id = rpc_utils.create_job_common(
            'powerwash', priority=priorities.Priority.SUPER,
            control_type='Server', control_file=control_file,
            hosts=[hostname])

    # Poll TKO every 10 seconds until the job has test statuses, aborting the
    # job once the deadline has passed.
    deadline = time.time() + timeout
    while True:
        if TKO.get_job_test_statuses_from_db(job_id):
            break
        if time.time() >= deadline:
            AFE.run('abort_host_queue_entries', job=job_id)
            raise TestPushException(
                    'Powerwash test on %s timeout after %ds, abort it.' %
                    (hostname, timeout))
        time.sleep(10)

    verify_test_results(job_id,
                        test_push_common.EXPECTED_TEST_RESULTS_POWERWASH)
    # Kick off verify, verify will fail and a repair should be triggered.
    AFE.reverify_hosts(hostnames=[hostname])


def reverify_all_push_duts():
    """Request a reverify of every host returned by the AFE."""
    print('Reverifying all DUTs.')
    hostnames = []
    for host in AFE.get_hosts():
        hostnames.append(host.hostname)
    AFE.reverify_hosts(hostnames=hostnames)


def parse_arguments(argv):
    """Parse arguments for test_push tool.

    When --build or --shard_build is not given, it is filled in with the image
    name that the AFE stable version map returns for the corresponding board.

    @param argv   Argument vector, as for `sys.argv`, including the
                  command name in `argv[0]`.
    @return: Parsed arguments, with `build` and `shard_build` always set.

    """
    parser = argparse.ArgumentParser(prog=argv[0])
    parser.add_argument('-b', '--board', dest='board', default='gandof',
                        help='Default is gandof.')
    parser.add_argument('-sb', '--shard_board', dest='shard_board',
                        default='quawks',
                        help='Default is quawks.')
    parser.add_argument('-i', '--build', dest='build', default=None,
                        help='Default is the latest stable build of given '
                             'board. Must be a stable build, otherwise AU test '
                             'will fail. (ex: gandof-release/R54-8743.25.0)')
    parser.add_argument('-si', '--shard_build', dest='shard_build', default=None,
                        help='Default is the latest stable build of given '
                             'board. Must be a stable build, otherwise AU test '
                             'will fail.')
    parser.add_argument('-p', '--pool', dest='pool', default='bvt')
    parser.add_argument('-t', '--timeout_min', dest='timeout_min', type=int,
                        default=DEFAULT_TIMEOUT_MIN_FOR_SUITE_JOB,
                        help='Time in mins to wait before abort the jobs we '
                             'are waiting on. Only for the asynchronous suites '
                             'triggered by create_and_return flag.')
    # The value is parsed with ast.literal_eval, so it must be a Python
    # literal; the default is built from DEFAULT_NUM_DUTS.
    parser.add_argument('-ud', '--num_duts', dest='num_duts',
                        default=dict(DEFAULT_NUM_DUTS),
                        type=ast.literal_eval,
                        help="Python dict literal that specifies the required"
                        " number of DUTs for each board. E.g {'gandof':4}")
    parser.add_argument('-c', '--continue_on_failure', action='store_true',
                        dest='continue_on_failure',
                        help='All tests continue to run when there is failure')
    parser.add_argument('-sl', '--service_respawn_limit', type=int,
                        default=DEFAULT_SERVICE_RESPAWN_LIMIT,
                        help='If a service crashes more than this, the test '
                             'push is considered failed.')

    arguments = parser.parse_args(argv[1:])

    # Get latest stable build as default build.
    version_map = AFE.get_stable_version_map(AFE.CROS_IMAGE_TYPE)
    if not arguments.build:
        arguments.build = version_map.get_image_name(arguments.board)
    if not arguments.shard_build:
        arguments.shard_build = version_map.get_image_name(
            arguments.shard_board)
    return arguments


def do_run_suite(suite_name, arguments, use_shard=False,
                 create_and_return=False):
    """Call run_suite to run a suite job, and return the suite job id.

    The script waits for the suite job to finish before returning the suite
    job id. It also echoes the run_suite output to stdout and records every
    output line in the shared _run_suite_output list.

    @param suite_name: Name of a suite, e.g., dummy.
    @param arguments: Arguments for run_suite command.
    @param use_shard: If true, suite is scheduled for shard board.
    @param create_and_return: If True, run_suite just creates the suite, print
                              the job id, then finish immediately.

    @return: Suite job ID.
    @raise TestPushException: if no suite job id is found in the run_suite
                              output, or if an asynchronous suite does not
                              finish within arguments.timeout_min minutes.

    """
    if use_shard:
        board = arguments.shard_board
        build = arguments.shard_build
    else:
        board = arguments.board
        build = arguments.build

    # Remove cros-version label to force provision.
    hosts = AFE.get_hosts(label=constants.Labels.BOARD_PREFIX+board,
                          locked=False)
    for host in hosts:
        labels_to_remove = [
                l for l in host.labels
                if l.startswith(provision.CROS_VERSION_PREFIX)]
        if labels_to_remove:
            AFE.run('host_remove_labels', id=host.id, labels=labels_to_remove)

        # Test repair work flow on shards, powerwash test will timeout after 7m.
        if use_shard and not create_and_return:
            powerwash_dut_to_test_repair(host.hostname, timeout=420)

    current_dir = os.path.dirname(os.path.realpath(__file__))
    cmd = [os.path.join(current_dir, RUN_SUITE_COMMAND),
           '-s', suite_name,
           '-b', board,
           '-i', build,
           '-p', arguments.pool,
           '--minimum_duts', str(arguments.num_duts[board])]
    if create_and_return:
        cmd += ['-c']

    suite_job_id = None

    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)

    while True:
        line = proc.stdout.readline()

        # readline() returns an empty value only at end-of-file, i.e. once
        # run_suite has closed its output. Wait for the process to exit
        # instead of spinning on empty reads and echoing blank lines.
        if not line:
            proc.wait()
            break

        # Under Python 3 the pipe yields bytes; decode so that printing and
        # the str-pattern regex below work. Under Python 2 the line is already
        # a str and is left untouched.
        if not isinstance(line, str):
            line = line.decode('utf-8', 'replace')

        stripped_line = line.rstrip()
        print(stripped_line)
        _run_suite_output.append(stripped_line)

        if not suite_job_id:
            m = re.match(SUITE_JOB_START_INFO_REGEX, line)
            if m and m.group(1):
                suite_job_id = int(m.group(1))
                _all_suite_ids.append(suite_job_id)

    proc.stdout.close()

    if not suite_job_id:
        raise TestPushException('Failed to retrieve suite job ID.')

    # If create_and_return specified, wait for the suite to finish.
    if create_and_return:
        end = time.time() + arguments.timeout_min * 60
        while not AFE.get_jobs(id=suite_job_id, finished=True):
            if time.time() < end:
                time.sleep(10)
            else:
                AFE.run('abort_host_queue_entries', job=suite_job_id)
                raise TestPushException(
                        'Asynchronous suite triggered by create_and_return '
                        'flag has timed out after %d mins. Aborting it.' %
                        arguments.timeout_min)

    print('Suite job %s is completed.' % suite_job_id)
    return suite_job_id


def check_dut_image(build, suite_job_id):
    """Check that every DUT used by the suite reports the expected build.

    The hosts are taken from the first host queue entry of each child job of
    the suite job.

    @param build: Expected build to be imaged.
    @param suite_job_id: job ID of the suite job.
    @raise TestPushException: If a DUT does not have expected build imaged.
    """
    print('Checking image installed in DUTs...')
    child_jobs = models.Job.objects.filter(parent_job_id=suite_job_id)

    hostnames = set()
    for job in child_jobs:
        first_hqe = models.HostQueueEntry.objects.filter(job_id=job.id)[0]
        hostnames.add(first_hqe.host.hostname)

    for hostname in hostnames:
        found_build = site_utils.get_build_from_afe(hostname, AFE)
        if found_build == build:
            continue
        raise TestPushException('DUT is not imaged properly. Host %s has '
                                'build %s, while build %s is expected.' %
                                (hostname, found_build, build))


def test_suite(suite_name, expected_results, arguments, use_shard=False,
               create_and_return=False):
    """Run a suite through run_suite and check its outcome.

    @param suite_name: Name of a suite, e.g., dummy
    @param expected_results: A dictionary of test name to test result.
    @param arguments: Arguments for run_suite command.
    @param use_shard: If true, suite is scheduled for shard board.
    @param create_and_return: If True, run_suite just creates the suite, print
                              the job id, then finish immediately.
    """
    suite_job_id = do_run_suite(suite_name, arguments, use_shard=use_shard,
                                create_and_return=create_and_return)

    expected_build = arguments.build
    # hqe.host_id for jobs running in shard is not synced back to master db,
    # so the DUT image check is only done for non-shard suites.
    if not use_shard:
        check_dut_image(expected_build, suite_job_id)

    # The test results must match the expected results.
    verify_test_results(suite_job_id, expected_results)


def verify_test_results(job_id, expected_results):
    """Compare a job's TKO test views against the expected results.

    Also checks that the job's log URL can be fetched; a failure to load it is
    added to the list of problems.

    @param job_id: id of the running jobs. For suite job, it is suite_job_id.
    @param expected_results: A dictionary of test name to test result.
    @raise TestPushException: If verify fails.
    """
    print('Comparing test results...')
    views = site_utils.get_test_views_from_tko(job_id, TKO)
    problems = test_push_common.summarize_push(views, expected_results,
                                               _IGNORED_TESTS)

    # Test link to log can be loaded.
    job_tag = '%s-%s' % (job_id, getpass.getuser())
    host_url = rpc_client_lib.add_protocol(URL_HOST)
    log_link = URL_PATTERN % (host_url, job_tag)
    try:
        urllib.request.urlopen(log_link).read()
    except urllib.error.URLError:
        problems.append('Failed to load page for link to log: %s.' % log_link)

    if not problems:
        return
    raise TestPushException('\n'.join(problems))

def test_suite_wrapper(queue, suite_name, expected_results, arguments,
                       use_shard=False, create_and_return=False):
    """Run test_suite and forward any exception to the parent process.

    @param queue: Queue to save exception to be accessed by parent process.
    @param suite_name: Name of a suite, e.g., dummy
    @param expected_results: A dictionary of test name to test result.
    @param arguments: Arguments for run_suite command.
    @param use_shard: If true, suite is scheduled for shard board.
    @param create_and_return: If True, run_suite just creates the suite, print
                              the job id, then finish immediately.
    """
    try:
        test_suite(suite_name, expected_results, arguments,
                   use_shard=use_shard, create_and_return=create_and_return)
    except Exception:
        exc_type, exc_value, exc_tb = sys.exc_info()
        # Storing the whole exc_info leads to a PicklingError, so the
        # traceback object is replaced by its extracted stack entries.
        stack_entries = traceback.extract_tb(exc_tb)
        queue.put((exc_type, exc_value, stack_entries))


def check_queue(queue):
    """Re-raise the next exception stored in the queue, if there is one.

    Each queue entry is indexed as (exception type, exception value, stack
    trace). The stack trace is printed before an exception of the stored type
    is raised with the stored value as its argument.

    @param queue: Queue used to store exception for parent process to access.
    @raise: Any exception found in the queue.
    """
    if not queue.empty():
        entry = queue.get()
        original_stack = entry[2]
        # Show where the exception originally came from before re-raising.
        print('Original stack trace of the exception:\n%s' % original_stack)
        raise entry[0](entry[1])


def _run_test_suites(arguments):
    """Run the push_to_prod and dummy suites in parallel child processes.

    Any exception a child puts on the shared queue is re-raised here by
    check_queue.

    @param arguments: Parsed command line arguments.
    """
    queue = multiprocessing.Queue()

    push_to_prod_suite = multiprocessing.Process(
            target=test_suite_wrapper,
            args=(queue, PUSH_TO_PROD_SUITE,
                  test_push_common.EXPECTED_TEST_RESULTS, arguments))
    # Use daemon flag will kill child processes when parent process fails.
    push_to_prod_suite.daemon = not arguments.continue_on_failure
    push_to_prod_suite.start()

    # suite test with --create_and_return flag
    asynchronous_suite = multiprocessing.Process(
            target=test_suite_wrapper,
            args=(queue, DUMMY_SUITE,
                  test_push_common.EXPECTED_TEST_RESULTS_DUMMY,
                  arguments, True, True))
    asynchronous_suite.daemon = True
    asynchronous_suite.start()

    workers = (push_to_prod_suite, asynchronous_suite)
    # Poll every 5 seconds for child failures while either child is running,
    # then check once more for anything queued just before the children exited.
    while any(worker.is_alive() for worker in workers):
        check_queue(queue)
        time.sleep(5)
    check_queue(queue)
    for worker in workers:
        worker.join()


def check_service_crash(respawn_limit, start_time):
    """Check whether scheduler or host_scheduler crashed during testing.

    Since the testing push is kicked off at the beginning of a given hour, the
    way to check whether a service crashed is to check whether the number of
    times the service was respawned during the testing push exceeds
    respawn_limit. A respawn is counted for each `<service>.log.<timestamp>`
    file in the autotest logs directory whose timestamp lies between
    start_time and now.

    Log files whose suffix is not a `%Y-%m-%d-%H.%M.%S` timestamp are skipped
    instead of aborting the whole check with a ValueError.

    @param respawn_limit: The maximum number of times the service is allowed to
                          be respawned.
    @param start_time: The time that testing push is kicked off.
    @raise TestPushException: if any service was respawned more than
                              respawn_limit times.
    """
    def _parse(filename_prefix, filename):
        """Helper method to parse the time of the log.

        @param filename_prefix: The prefix of the filename.
        @param filename: The name of the log file.
        @return: The parsed datetime, or None if the part of the name after
                 the prefix is not a timestamp in the expected format.
        """
        try:
            return datetime.datetime.strptime(filename[len(filename_prefix):],
                                              "%Y-%m-%d-%H.%M.%S")
        except ValueError:
            return None

    services = ['scheduler', 'host_scheduler']
    logs = os.listdir('%s/logs/' % AUTOTEST_DIR)
    curr_time = datetime.datetime.now()

    error_msgs = []
    for service in services:
        log_prefix = '%s.log.' % service
        respawn_count = 0
        for log in logs:
            if not log.startswith(log_prefix):
                continue
            log_time = _parse(log_prefix, log)
            if log_time is not None and start_time <= log_time <= curr_time:
                respawn_count += 1

        if respawn_count > respawn_limit:
            error_msgs.append(
                    '%s has been respawned %s times during testing push at %s. '
                    'It is very likely crashed. Please check!\n' %
                    (service, respawn_count,
                     start_time.strftime("%Y-%m-%d-%H")))
    if error_msgs:
        raise TestPushException(''.join(error_msgs))


# Message printed by _main after all of its checks have passed.
_SUCCESS_MSG = """
All staging tests completed successfully.

Instructions for pushing to prod are available at
https://goto.google.com/autotest-to-prod
"""


def _main(arguments):
    """Run the full push test and print the success message when it passes.

    The steps are: reverify all DUTs, check the DUT inventory, run the test
    suites and check for service crashes. On any failure the suite jobs that
    are still unfinished are aborted, unless continue_on_failure is set, and
    the exception is re-raised.

    @param arguments: command line arguments.
    """
    # These are globals used by other functions in this module to communicate
    # back from worker processes.
    global _run_suite_output
    global _all_suite_ids

    # TODO Use chromite.lib.parallel.Manager instead, to workaround the
    # too-long-tmp-path problem.
    mpmanager = multiprocessing.Manager()
    _run_suite_output = mpmanager.list()
    _all_suite_ids = mpmanager.list()

    try:
        start_time = datetime.datetime.now()
        reverify_all_push_duts()
        # Wait for the verify test to start.
        time.sleep(15)
        check_dut_inventory(arguments.num_duts, arguments.pool)
        _run_test_suites(arguments)
        check_service_crash(arguments.service_respawn_limit, start_time)
        print(_SUCCESS_MSG)
    except Exception:
        # Leave the jobs running when flagged to continue on failure.
        if arguments.continue_on_failure:
            raise
        for suite_id in _all_suite_ids:
            still_running = AFE.get_jobs(id=suite_id, finished=False)
            if still_running:
                AFE.run('abort_host_queue_entries', job=suite_id)
        raise


def main():
    """Entry point: parse sys.argv and run the push test."""
    _main(parse_arguments(sys.argv))


# Run the tool only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()