site_tests/kernel_IdlePerf/kernel_IdlePerf.py

#!/usr/bin/python2
#
# Copyright (c) 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import logging
import os
import json
import math
import re
import numpy

from autotest_lib.server import test
from autotest_lib.server.cros import telemetry_runner
from autotest_lib.client.common_lib import error

# This test detects issues with low-throughput latency-sensitive workloads
# caused by entering idle state.
#
# Such loads sleep regularly but also need to wake up and hit deadlines. We've
# observed on some systems that if idle-state is enabled, we miss a lot of
# deadlines (even though the compute capacity is sufficient).
#
# This test runs a subset of the rendering.desktop benchmark with idle-state
# both enabled and disabled, and looks for a discrepancy in the results. This
# workload is quite noisy, so we run multiple times and take N * stdev as the
# threshold for flagging an issue.
#
# In testing, this approach seemed quite robust, if the parameters (repetitions
# and threshold) are set appropriately. Increasing page-set repetitions helped a
# lot (reduces noise), as did selecting a good value for N (which trades off
# false positives vs. false negatives).
#
# Based on testing, we found good results by using 5 indicative pages, setting
# pageset-repetitions to 7, and taking the mean - 2 * stddev as the estimate
# for "we can be confident that the true regression is not worse than this".
#
# This results in under-estimating the regression (typically by around 2 with
# a healthy system), so false alarms should be rare or non-existent. In testing
# 50 iterations with a good and bad system, this identified 100% of regressions
# and non-regressions correctly (in fact mean - 1 * stddev would also have done
# so, but this seems a bit marginal).

# Number of times each page in the page set is repeated
PAGESET_REPEAT = 7

# Subset of rendering.desktop stories to run. Use None to run every story in
# the benchmark (at the cost of a longer test).
#
# Simple pages show the issue most clearly, because the system is more likely
# to enter an idle state while rendering them.
#
# The pages below were chosen by running all pages many times on a system
# which exhibits the issue, and keeping those with a high value of
# mean_regression - 2 * stddev, i.e. those giving the clearest indication of
# a regression. The selection mixes real pages (e.g. blogspot_2018) with
# synthetic ones (e.g. transform_transitions_js_block).
#
# For a longer test, 'twitter_2018' and 'wikipedia_2018' can be added.
PAGES = [
    'blogspot_2018',
    'transform_transitions_js_block',
    'throughput_scrolling_passive_handler',
]

# Telemetry benchmark to run
BENCHMARK = 'rendering.desktop'

# sysfs control file used to disable an idle state; format with (cpu, state)
DISABLE_PATH = '/sys/devices/system/cpu/cpu{}/cpuidle/state{}/disable'

class kernel_IdlePerf(test.test):
    """
    Server side regression test for performance impact of idle-state.

    This test runs some smoothness tests with and without sleep enabled, to
    check that the impact of enabling sleep is not significant.

    """
    version = 1
    # Set to True once the test has started changing the idle-state sysfs
    # controls, so that cleanup() knows it has to restore them.
    _cleanup_required = False

    def _check_sysfs(self, host):
        """Check that the DUT lets us toggle idle states, and record them.

        Populates self.states (idle state numbers other than state0) and
        self.cpu_count, then performs a disable/enable round trip to prove
        the controls actually work.

        @param host: host we are running telemetry on.

        @raises error.TestNAError: if the DUT is not an aarch64 machine.
        @raises error.TestError: if the idle state cannot be disabled or
                re-enabled.

        """
        # First check that we are on a suitable DUT which offers the ability to
        # disable the idle state
        arch = host.run_output('uname -m')
        if arch != 'aarch64':
            # Idle states differ between CPU architectures, so this test would
            # need further development to support other platforms.
            raise error.TestNAError('Test only supports Arm aarch64 CPUs')
        if not host.path_exists(DISABLE_PATH.format(0, 1)):
            logging.error('sysfs path absent: cannot disable idle state')
            raise error.TestError('Cannot disable idle state')

        # Identify available idle states. state0 is running state; other states
        # should be disabled when disabling idle.
        self.states = []
        state_dirs = host.run_output(
            'ls -1 /sys/devices/system/cpu/cpu0/cpuidle/')
        for state in state_dirs.splitlines():
            # Look for dirnames like 'state1' (but exclude 'state0')
            if re.match(r'state[1-9][0-9]*$', state):
                self.states.append(int(state[len('state'):]))
        logging.info('Found idle states: %s', self.states)

        self.cpu_count = int(host.run_output('nproc --all'))
        logging.info('Found %s cpus', self.cpu_count)
        logging.info('Idle enabled = %s', self._is_idle_enabled(host))

        # From this point on we expect the test to be able to run, so we will
        # need to ensure that the idle state is restored when the test exits
        self._cleanup_required = True
        self._enable_idle(host, False)
        if self._is_idle_enabled(host):
            logging.error('Failed to disable idle state')
            raise error.TestError('Cannot disable idle state')
        self._enable_idle(host, True)
        if not self._is_idle_enabled(host):
            logging.error('Failed to re-enable idle state')
            raise error.TestError('Cannot re-enable idle state')

    def _is_idle_enabled(self, host):
        """Report whether idle is enabled, judged by state1 of cpu0 only.

        @param host: host we are running telemetry on.

        @returns True if the 'disable' control file of cpu0/state1 reads '0'.

        """
        return host.run_output('cat ' + DISABLE_PATH.format(0, 1)) == '0'

    def _enable_idle(self, host, enable):
        """Enable or disable every known idle state on every CPU.

        Relies on self.cpu_count and self.states, which are set by
        _check_sysfs().

        @param host: host we are running telemetry on.
        @param enable: True to enable the idle states, False to disable them.

        """
        logging.info('Setting idle enabled to %s', enable)
        # The sysfs file is a *disable* switch, hence the inverted value
        value = '0' if enable else '1'
        for cpu in range(self.cpu_count):
            for state in self.states:
                path = DISABLE_PATH.format(cpu, state)
                host.run_output('echo {} > {}'.format(value, path))

    def _parse_results_file(self, path):
        """Extract per-story smoothness scores from a histograms JSON file.

        @param path: path of the JSON file to read.

        @returns dict mapping story name to a dict holding the raw
                 'exp_percentage_smooth' samples, their mean and their
                 standard deviation.

        """
        with open(path) as fp:
            histogram_json = json.load(fp)

        # Entries carrying a guid and a non-empty value list are used to
        # resolve the story reference of each histogram to a story name.
        guids = {x['guid']: x['values'][0] for x in histogram_json
                 if 'guid' in x and 'values' in x and len(x['values']) > 0}

        samples = {}
        for entry in histogram_json:
            if 'name' in entry and entry['name'] == 'exp_percentage_smooth':
                story = guids[entry['diagnostics']['stories']]
                samples.setdefault(story, []).append(
                    entry['sampleValues'][0])

        return {
            story: {
                'raw_exp_percentage_smooth_scores': values,
                'exp_percentage_smooth': numpy.mean(values),
                'std': numpy.std(values),
            }
            for story, values in samples.items()
        }

    def _compare_results(self, idle_enabled, idle_disabled):
        """Compare the idle-enabled scores against the idle-disabled ones.

        A page passes when its idle-enabled mean is no more than two combined
        standard deviations below its idle-disabled mean.

        @param idle_enabled: per-page results, as returned by
               _parse_results_file(), measured with idle enabled.
        @param idle_disabled: the same, measured with idle disabled.

        @returns dict with an overall 'passed' flag plus one entry per page
                 (keyed by the page name with non-word characters replaced
                 by '_') holding both inputs, the difference, its standard
                 deviation and the per-page verdict.

        @raises error.TestError: if there is nothing to compare, or a page
                has no idle-disabled counterpart.

        """
        # Without any data the loop below would report a vacuous pass
        if not idle_enabled:
            raise error.TestError('No smoothness results to compare')

        missing = sorted(set(idle_enabled) - set(idle_disabled))
        if missing:
            raise error.TestError(
                'No idle-disabled results for pages: {}'.format(
                    ', '.join(missing)))

        ignored = sorted(set(idle_disabled) - set(idle_enabled))
        if ignored:
            logging.warning('Ignoring pages with no idle-enabled results: %s',
                            ', '.join(ignored))

        results = {
            'passed': True
        }
        for page in idle_enabled:
            enabled = idle_enabled[page]
            disabled = idle_disabled[page]
            diff = (disabled['exp_percentage_smooth']
                    - enabled['exp_percentage_smooth'])
            # Standard deviation of the difference of the two measurements
            diff_std = math.sqrt(enabled['std'] ** 2 + disabled['std'] ** 2)
            passed = (enabled['exp_percentage_smooth'] >=
                      (disabled['exp_percentage_smooth'] - diff_std * 2))
            key = re.sub(r'\W', '_', page)
            results[key] = {
                'idle_enabled': enabled,
                'idle_disabled': disabled,
                'difference': diff,
                'difference_std': diff_std,
                'passed': passed
            }
            results['passed'] = results['passed'] and passed
        return results

    def _run_telemetry(self, host, telemetry, enable):
        """Run the benchmark with idle enabled or disabled, and parse it.

        @param host: host we are running telemetry on.
        @param telemetry: the TelemetryRunner used to run the benchmark.
        @param enable: True to run with idle enabled, False with it disabled.

        @returns per-story results, as returned by _parse_results_file().

        @raises error.TestFail: if the benchmark does not report success.

        """
        logging.info('Running telemetry with idle enabled = %s', enable)
        self._enable_idle(host, enable)

        args = ['--pageset-repeat={}'.format(PAGESET_REPEAT)]
        if PAGES:
            # One anchored alternative per page, so only exact names match.
            # NOTE(review): the backslashes presumably shield the regex
            # metacharacters from a shell further down the line -- confirm.
            stories = r'\|'.join(r'\(^' + p + r'$\)' for p in PAGES)
            story_filter = '--story-filter={}'.format(stories)
            args.append(story_filter)

        logging.info('Running telemetry with args: %s', args)
        result = telemetry.run_telemetry_benchmark(
            BENCHMARK, self, *args)
        if result.status != telemetry_runner.SUCCESS_STATUS:
            raise error.TestFail('Failed to run benchmark')

        # ensure first run doesn't get overwritten by second run
        default_path = os.path.join(self.resultsdir, 'histograms.json')
        if enable:
            unique_name = 'results-histograms-idle-enabled.json'
        else:
            unique_name = 'results-histograms-idle-disabled.json'
        unique_path = os.path.join(self.resultsdir, unique_name)
        os.rename(default_path, unique_path)

        return self._parse_results_file(unique_path)

    def run_once(self, host=None, args=None):
        """Run the telemetry scrolling benchmark.

        @param host: host we are running telemetry on.
        @param args: optional dict of test arguments; 'local' set to the
               string 'True' is passed on to the TelemetryRunner.

        @raises error.TestFail: if enabling CPU idle significantly regresses
                the smoothness score of any page.

        """
        # A dict literal as default would be shared between calls
        if args is None:
            args = {}

        logging.info('Checking sysfs')
        self._check_sysfs(host)

        local = args.get('local') == 'True'
        telemetry = telemetry_runner.TelemetryRunner(
                        host, local, telemetry_on_dut=False)

        logging.info('Starting test')
        results_idle = self._run_telemetry(host, telemetry, True)
        results_noidle = self._run_telemetry(host, telemetry, False)

        # Score is the regression in percentage of smooth frames caused by
        # enabling CPU idle.
        logging.info('Processing results')
        results = self._compare_results(results_idle, results_noidle)

        self.write_perf_keyval(results)

        if not results['passed']:
            raise error.TestFail('enabling CPU idle significantly '
                                 'regresses scrolling performance')

    def cleanup(self, host):
        """Cleanup of the test.

        Re-enables the idle states if the test got far enough to change them.

        @param host: host we are running telemetry on.

        """
        if self._cleanup_required:
            logging.info('Restoring idle to enabled')
            self._enable_idle(host, True)