#!/usr/bin/python
#
# Copyright (c) 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import logging
import os
import json
import math
import re

from autotest_lib.server import test
from autotest_lib.server.cros import telemetry_runner
from autotest_lib.client.common_lib import error

# This test detects issues with low-throughput latency-sensitive workloads
# caused by entering idle state.
#
# Such loads sleep regularly but also need to wake up and hit deadlines. We've
# observed on some systems that if idle-state is enabled, we miss a lot of
# deadlines (even though the compute capacity is sufficient).
#
# This test runs top_25_smooth with idle-state both enabled and disabled, and
# looks for a discrepancy in the results. This workload is quite noisy, so
# we run multiple times and take N * stddev as the threshold for flagging an
# issue.
#
# In testing, this approach seemed quite robust, if the parameters (repetitions
# and threshold) are set appropriately. Increasing page-set repetitions helped a
# lot (reduces noise), as did selecting a good value for N (which trades off
# false positives vs. false negatives).
#
# Based on testing, we found good results by using 5 indicative pages, setting
# pageset-repetitions to 7, and taking the mean - 2 * stddev as the estimate
# for "we can be confident that the true regression is not worse than this".
#
# This results in under-estimating the regression (typically by around 2 with
# a healthy system), so false alarms should be rare or non-existent. In testing
# 50 iterations with a good and bad system, this identified 100% of regressions
# and non-regressions correctly (in fact mean - 1 * stddev would also have done
# so, but this seems a bit marginal).

# Repeat each page given number of times
PAGESET_REPEAT = 7

# PAGES can be set to a subset of pages to run for a shorter test, or None to
# run all pages in top_25_smooth.
# Simpler pages emphasise the issue more, as the system is more likely to enter
# idle state.
#
# These were selected by running all pages many times (on a system which
# exhibits the issue), and choosing the 5 pages which have the highest values
# for mean_regression - 2 * stddev - i.e. give the clearest indication of a
# regression.
PAGES = ['games.yahoo', 'Blogger', 'LinkedIn', 'cats', 'booking']

# Path to sysfs control file for disabling idle state.
# Format arguments are (cpu index, idle state index).
DISABLE_PATH = '/sys/devices/system/cpu/cpu{}/cpuidle/state{}/disable'


class kernel_IdlePerf(test.test):
    """
    Server side regression test for performance impact of idle-state.

    This test runs some smoothness tests with and without sleep enabled, to
    check that the impact of enabling sleep is not significant.

    """
    version = 1
    # Set once the test has started touching idle-state controls, so that
    # cleanup() knows the DUT must be restored to idle-enabled.
    _cleanup_required = False

    def _check_sysfs(self, host):
        """Check the DUT can toggle idle states and record its topology.

        Populates self.states (indices of every idle state except state0) and
        self.cpu_count, then does a disable/enable round trip to prove the
        sysfs controls actually take effect.

        @param host: host we are running telemetry on.

        @raises error.TestNAError: if the DUT is not an aarch64 machine.
        @raises error.TestError: if the sysfs control is missing or the idle
                state cannot be disabled and re-enabled.
        """
        # First check that we are on a suitable DUT which offers the ability to
        # disable the idle state
        arch = host.run_output('uname -m')
        if arch != 'aarch64':
            # Idle states differ between CPU architectures, so this test would
            # need further development to support other platforms.
            raise error.TestNAError('Test only supports Arm aarch64 CPUs')
        if not host.path_exists(DISABLE_PATH.format(0, 1)):
            logging.error('sysfs path absent: cannot disable idle state')
            raise error.TestError('Cannot disable idle state')

        # Identify available idle states. state0 is running state; other states
        # should be disabled when disabling idle.
        self.states = []
        state_dirs = host.run_output(
            'ls -1 /sys/devices/system/cpu/cpu0/cpuidle/')
        for state in state_dirs.splitlines():
            # Look for dirnames like 'state1' (but exclude 'state0')
            found = re.match(r'state([1-9][0-9]*)$', state)
            if found:
                self.states.append(int(found.group(1)))
        logging.info('Found idle states: %s', self.states)

        self.cpu_count = int(host.run_output('nproc --all'))
        logging.info('Found %s cpus', self.cpu_count)
        logging.info('Idle enabled = %s', self._is_idle_enabled(host))

        # From this point on we expect the test to be able to run, so we will
        # need to ensure that the idle state is restored when the test exits
        self._cleanup_required = True
        self._enable_idle(host, False)
        if self._is_idle_enabled(host):
            logging.error('Failed to disable idle state')
            raise error.TestError('Cannot disable idle state')
        self._enable_idle(host, True)
        if not self._is_idle_enabled(host):
            logging.error('Failed to re-enable idle state')
            raise error.TestError('Cannot re-enable idle state')

    def _is_idle_enabled(self, host):
        """Report whether idle is enabled, sampled from cpu0/state1 only.

        @param host: host we are running telemetry on.

        @returns True if the 'disable' control for cpu0 state1 reads '0'.
        """
        return host.run_output('cat ' + DISABLE_PATH.format(0, 1)) == '0'

    def _enable_idle(self, host, enable):
        """Enable or disable every recorded idle state on every CPU.

        Requires self.cpu_count and self.states, which _check_sysfs() sets.

        @param host: host we are running telemetry on.
        @param enable: True to enable the idle states, False to disable them.
        """
        logging.info('Setting idle enabled to %s', enable)
        # The sysfs file is a 'disable' flag, hence the inverted value.
        x = '0' if enable else '1'
        for cpu in range(self.cpu_count):
            for state in self.states:
                path = DISABLE_PATH.format(cpu, state)
                host.run_output('echo {} > {}'.format(x, path))

    def _parse_results_file(self, path):
        """Extract per-page smoothness scores from a results chart file.

        @param path: path to a JSON file containing a
                ['charts']['percentage_smooth'] mapping of page name to a
                dict with 'values' and 'std' entries.

        @returns dict mapping page name to a dict with keys
                'percentage_smooth' (mean of the page's 'values') and 'std'.
                The 'summary' entry is skipped.
        """
        with open(path) as fp:
            histogram_json = json.load(fp)

        scores = {}
        # list of % smooth scores for each page and for each pageset-repetition
        per_page = histogram_json['charts']['percentage_smooth']
        for page, page_result in per_page.items():
            if page == 'summary':
                continue
            values = page_result['values']
            scores[page] = {
                'percentage_smooth': sum(values) / float(len(values)),
                'std': page_result['std'],
            }
        return scores

    def _compare_results(self, idle_enabled, idle_disabled):
        """Compare per-page scores with idle enabled against idle disabled.

        A page passes when its idle-enabled score is greater than the
        idle-disabled score minus two combined standard deviations.

        @param idle_enabled: scores from _parse_results_file(), idle enabled.
        @param idle_disabled: scores from _parse_results_file(), idle
                disabled. Must contain every page present in idle_enabled.

        @returns dict with an overall boolean 'passed' entry plus one entry
                per page (page name with non-word characters replaced by '_')
                holding both inputs, the difference, its standard deviation
                and the per-page verdict.
        """
        results = {'passed': True}
        for page in idle_enabled:
            enabled = idle_enabled[page]
            disabled = idle_disabled[page]
            diff = disabled['percentage_smooth'] - enabled['percentage_smooth']
            # Combine the two standard deviations in quadrature.
            diff_std = math.sqrt(enabled['std'] ** 2 + disabled['std'] ** 2)
            passed = (enabled['percentage_smooth'] >
                      (disabled['percentage_smooth'] - diff_std * 2))
            key = re.sub(r'\W', '_', page)
            results[key] = {
                'idle_enabled': enabled,
                'idle_disabled': disabled,
                'difference': diff,
                'difference_std': diff_std,
                'passed': passed,
            }
            results['passed'] = results['passed'] and passed
        return results

    def _run_telemetry(self, host, telemetry, enable):
        """Run the smoothness benchmark with idle set as requested.

        @param host: host we are running telemetry on.
        @param telemetry: TelemetryRunner used to launch the benchmark.
        @param enable: True to run with idle enabled, False with it disabled.

        @returns parsed scores for this run (see _parse_results_file()).

        @raises error.TestFail: if the benchmark does not report success.
        """
        logging.info('Running telemetry with idle enabled = %s', enable)
        self._enable_idle(host, enable)

        args = ['--pageset-repeat={}'.format(PAGESET_REPEAT)]
        if PAGES:
            # Build an alternation of grouped page names, with the regex
            # metacharacters backslash-escaped.
            stories = r'\|'.join(r'\(' + p + r'\)' for p in PAGES)
            story_filter = '--story-filter={}'.format(stories)
            args.append(story_filter)

        logging.info('Running telemetry with args: %s', args)
        result = telemetry.run_telemetry_benchmark(
            'smoothness.top_25_smooth', self, *args)
        if result.status != telemetry_runner.SUCCESS_STATUS:
            raise error.TestFail('Failed to run benchmark')

        # ensure first run doesn't get overwritten by second run
        default_path = os.path.join(self.resultsdir, 'results-chart.json')
        if enable:
            unique_path = os.path.join(self.resultsdir,
                                       'results-chart-idle-enabled.json')
        else:
            unique_path = os.path.join(self.resultsdir,
                                       'results-chart-idle-disabled.json')
        os.rename(default_path, unique_path)

        return self._parse_results_file(unique_path)

    def run_once(self, host=None, args=None):
        """Run the telemetry scrolling benchmark.

        @param host: host we are running telemetry on.
        @param args: optional dict of test arguments; 'local' == 'True'
                selects a local telemetry run.

        @raises error.TestFail: if enabling idle significantly regresses the
                smoothness score of any page.
        """
        if args is None:
            args = {}

        logging.info('Checking sysfs')
        self._check_sysfs(host)

        local = args.get('local') == 'True'
        telemetry = telemetry_runner.TelemetryRunner(
            host, local, telemetry_on_dut=False)

        logging.info('Starting test')
        results_idle = self._run_telemetry(host, telemetry, True)
        results_noidle = self._run_telemetry(host, telemetry, False)

        # Score is the regression in percentage of smooth frames caused by
        # enabling CPU idle.
        logging.info('Processing results')
        results = self._compare_results(results_idle, results_noidle)

        self.write_perf_keyval(results)

        if not results['passed']:
            raise error.TestFail('enabling CPU idle significantly '
                                 'regresses scrolling performance')

    def cleanup(self, host):
        """Cleanup of the test.

        Re-enables idle if the test got far enough to modify it.

        @param host: host we are running telemetry on.
        """
        if self._cleanup_required:
            logging.info('Restoring idle to enabled')
            self._enable_idle(host, True)