# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""
Test to generate the AFDO profile for a set of ChromeOS benchmarks.

This will run a pre-determined set of benchmarks on the DUT under
the monitoring of the linux "perf" tool. The resulting perf.data
file will then be copied to Google Storage (GS) where it can be
used by the AFDO optimized build.

Given that the telemetry benchmarks are quite unstable on ChromeOS at
this point, this test also supports a mode where the benchmarks are
executed outside of the telemetry framework. It is not the same as
executing the benchmarks under telemetry because no telemetry
measurements are taken but, for the purposes of profiling Chrome, it
should be pretty close.

Example invocation:
/usr/bin/test_that --debug --board=lumpy <DUT IP>
  --args="ignore_failures=True local=True gs_test_location=True"
  telemetry_AFDOGenerate
"""

import bz2
import logging
import os
import time

from autotest_lib.client.common_lib import error
from autotest_lib.server import autotest
from autotest_lib.server import test
from autotest_lib.server import utils
from autotest_lib.server.cros import filesystem_util
from autotest_lib.server.cros import telemetry_runner
from autotest_lib.site_utils import test_runner_utils
from contextlib import contextmanager

# These are arguments to the linux "perf" tool.
# The -e value is processor specific and comes from the Intel SDM vol 3b
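# The remaining flags: -a profiles system-wide, -c 500000 takes one sample
# every 500000 occurrences of the event, and -b records branch stacks (LBR),
# which the AFDO tooling needs to attribute samples to branches.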
PROFILER_ARGS = 'record -a -e r20c4 -c 500000 -b'

WAIT_FOR_CMD_TIMEOUT_SECS = 60

# Reuse ssh and scp settings from telemetry_Crosperf
RSA_KEY = '-i %s' % test_runner_utils.TEST_KEY_PATH
DUT_SCP_OPTIONS = ' '.join([
        '-o StrictHostKeyChecking=no', '-o UserKnownHostsFile=/dev/null',
        '-o BatchMode=yes', '-o ConnectTimeout=30',
        '-o ServerAliveInterval=900', '-o ServerAliveCountMax=3',
        '-o ConnectionAttempts=4', '-o Protocol=2'
])
DUT_CHROME_RESULTS_DIR = '/usr/local/telemetry/src/tools/perf'

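# Shell snippet that polls a PID once per second for up to `timeout` seconds.
# The trailing `! ps %(pid)d` makes the whole command exit 0 only once the
# process is gone, so the exit status tells the caller whether the wait
# succeeded.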
_WAIT_CMD_TEMPLATE = """\
for _ in {1..%(timeout)d}; do \
  ps %(pid)d >/dev/null || break; \
  sleep 1; \
done; \
! ps %(pid)d >/dev/null \
"""


def _wait_for_process(host, pid, timeout=-1):
    """Waits for a process on the DUT to terminate.

    @param host: A host object representing the DUT.
    @param pid: The process ID (integer).
    @param timeout: Maximum number of seconds to wait for the process
            to exit.
    @returns Exit status of the wait command: 0 if the process terminated
            within the timeout, non-zero otherwise.
    """
    wait_cmd = _WAIT_CMD_TEMPLATE % {'pid': pid, 'timeout': timeout}
    return host.run(wait_cmd, ignore_status=True).exit_status


# List of benchmarks to run to capture profile information. This is
# based on the "superhero" list and other telemetry benchmarks. Goal is
# to have a short list that is as representative as possible and takes a
# short time to execute. At this point the list of benchmarks is in flux.
TELEMETRY_AFDO_BENCHMARKS = (
        # page_cycler tests are deprecated. Replace them with loading.desktop.
        ('loading.desktop', ('--pageset-repeat=1',
                             '--story-tag-filter=typical',
                             '--legacy-json-trace-format')),
        ('loading.desktop', ('--pageset-repeat=1',
                             '--story-tag-filter=intl_ja_zh',
                             '--legacy-json-trace-format')),
        ('rendering.desktop',
         ('--story-tag-filter=tough_canvas',
          '--story-filter="bouncing\\*\\|canvas\\*\\|microsoft\\*"',
          '--legacy-json-trace-format')),
        ('octane', ('--legacy-json-trace-format',)),
        ('kraken', ('--legacy-json-trace-format',)),
        ('speedometer2', ('--legacy-json-trace-format',)),
)
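# Each entry above is a (benchmark_name, (extra_args, ...)) tuple; the extra
# arguments are passed straight through to the telemetry run. An entry may
# also omit the args tuple, in which case the benchmark runs with no extra
# arguments (see run_once).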

# Temporarily disable this benchmark because it is failing a
# lot. Filed chromium:590127
# ('smoothness.tough_webgl_cases',)

# Some benchmarks removed from the profile set:
# 'page_cycler.morejs' -> uninteresting, seems to fail frequently.
# 'page_cycler.moz' -> seems very old.
# 'media.tough_video_cases' -> removed because it does not bring
#                              any benefit and takes more than 12 mins

# List of boards where this test can be run.  Currently, it needs a
# machine with at least 4GB of memory or 2GB of /tmp.
# This must be consistent with chromite.
GCC_BOARDS = ['lumpy']

# Should be disjoint with GCC_BOARDS
LLVM_BOARDS = ['chell']

# FIXME(tcwang): only used for testing Async AFDO generation builders.
# Remove this after testing is done.
# Due to crbug.com/991299 and crbug.com/992539, AFDO profiles generated
# on samus are not suitable for production on either master or branch.
# So samus is suitable for testing profile generation, but the profiles
# should not actually be used.
LLVM_BOARDS_ASYNC = ['samus']


class telemetry_AFDOGenerate(test.test):
    """
    Run one or more telemetry benchmarks under the "perf" monitoring
    tool, generate a "perf.data" file and upload it to GS for consumption
    by the AFDO optimized build.
    """
    version = 1

    def scp_perf_data(self, dut, host_dir):
        """Copy perf data from the DUT.

        @param dut: The autotest host object representing the DUT.
        @param host_dir: The directory on the host where the file is placed.

        @returns status code for the scp command.
        """
        cmd = []
        src = ('root@%s:%s/%s' % (dut.hostname, DUT_CHROME_RESULTS_DIR,
                                  'perf.data'))
        cmd.extend(['scp', DUT_SCP_OPTIONS, RSA_KEY, '-v', src, host_dir])
        command = ' '.join(cmd)

        logging.debug('Retrieving Perf Data: %s', command)
        try:
            result = utils.run(command, timeout=WAIT_FOR_CMD_TIMEOUT_SECS)
            exit_code = result.exit_status
        except Exception as e:
            logging.error('Failed to retrieve results: %s', e)
            raise

        logging.debug('command return value: %d', exit_code)
        return exit_code

    @contextmanager
    def perf_on_dut(self):
        """Context manager that runs perf on the DUT.

        Starts perf in the background on entry. On exit it stops perf,
        waits for the process to finish writing its output, and copies
        the resulting perf.data file back to the host.
        """
        logging.info('Starting perf process in background.')
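        # nohup keeps the profiler alive even if the shell session that
        # launched it goes away before the benchmarks finish.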
        perf_cmd = 'nohup perf %s -o %s/perf.data' \
                    % (PROFILER_ARGS, DUT_CHROME_RESULTS_DIR)
        perf_pid = self._host.run_background(perf_cmd)

        try:
            # Use `kill -0` to check whether the perf process is alive.
            verify_cmd = 'kill -0 %s' % perf_pid
            if self._host.run(verify_cmd, ignore_status=True).exit_status != 0:
                logging.error('Perf process not started correctly on DUT')
                raise RuntimeError
            logging.info('Perf PID: %s\nPerf command: %s', perf_pid, perf_cmd)
            yield
        finally:
            # Check if the process is still alive after the benchmark run;
            # if it is, stop it with -2 (SIGINT) so perf writes out its data.
            kill_cmd = 'kill -0 %s && killall -2 perf' % perf_pid
            if self._host.run(kill_cmd, ignore_status=True).exit_status != 0:
                logging.error('Perf process is not killed correctly on DUT.')
                raise RuntimeError
            # The perf process may not terminate right after the kill
            # command; wait until it finishes.
            status = _wait_for_process(self._host, int(perf_pid),
                                       WAIT_FOR_CMD_TIMEOUT_SECS)
            if status != 0:
                logging.error('Error waiting for perf process to be killed.')
                raise RuntimeError
            logging.info('Perf has been killed on DUT.')

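        # With perf stopped, copy the perf.data file from the DUT into this
        # test's profile directory on the host.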
        status = self.scp_perf_data(self._host, self.profdir)
        if status != 0:
            logging.error('Cannot copy perf.data file to host.')
            raise RuntimeError

    def run_once(self, host, args):
        """Run a set of telemetry benchmarks.

        @param host: The autotest host object representing the DUT.
        @param args: A dictionary of the arguments that were passed
                to this test.
        @returns None.
        """
        self._host = host
        host_board = host.get_board().split(':')[1]
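        # get_board() returns a label of the form 'board:<name>'; the split
        # above keeps only the board name.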

        if not (host_board in LLVM_BOARDS or host_board in GCC_BOARDS
                or host_board in LLVM_BOARDS_ASYNC):
            raise error.TestFail(
                    'This test cannot be run on board %s' % host_board)

        self._parse_args(args)

        # Remove rootfs write protection on the DUT up front. Otherwise the
        # telemetry code will try to remove it during run_benchmark, which
        # causes the machine to reboot and remount. We want to avoid that.
        filesystem_util.make_rootfs_writable(self._host)

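        # Profile while the benchmarks run: perf_on_dut() starts perf before
        # the body executes and stops it (and fetches perf.data) afterwards.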
        with self.perf_on_dut():
            if self._minimal_telemetry:
                self._run_tests_minimal_telemetry()
            else:
                self._telemetry_runner = telemetry_runner.TelemetryRunner(
                        self._host, self._local, telemetry_on_dut=False)

                for benchmark_info in TELEMETRY_AFDO_BENCHMARKS:
                    benchmark = benchmark_info[0]
                    if len(benchmark_info) == 1:
                        benchmark_args = ()
                    else:
                        benchmark_args = benchmark_info[1]
                    try:
                        self._run_test_with_retry(benchmark, *benchmark_args)
                    except error.TestBaseException:
                        if not self._ignore_failures:
                            raise
                        logging.info('Ignoring failure from benchmark %s.',
                                     benchmark)

    def after_run_once(self):
        """After the profile information has been collected, compress it
        and upload it to GS.
        """
        PERF_FILE = 'perf.data'
        COMP_PERF_FILE = 'chromeos-chrome-%s-%s.perf.data'
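        # The uploaded file is named chromeos-chrome-<arch>-<version>.perf.data
        # (plus a .bz2 suffix after compression).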
        perf_data = os.path.join(self.profdir, PERF_FILE)
        comp_data = os.path.join(self.profdir,
                                 COMP_PERF_FILE % (self._arch, self._version))
        compressed = self._compress_file(perf_data, comp_data)
        self._gs_upload(compressed, os.path.basename(compressed))

        # Also create a copy of this file using "LATEST" as the version so
        # it can be found in case the builder is looking for a version
        # number that does not match. It is ok to use a slightly old
        # version of this file for the optimized build.
        latest_data = COMP_PERF_FILE % (self._arch, 'LATEST')
        latest_compressed = self._get_compressed_name(latest_data)
        self._gs_upload(compressed, latest_compressed)

        # Remove the local files so they are not uploaded along with the logs.
        os.remove(compressed)
        os.remove(perf_data)

    def _parse_args(self, args):
        """Parses input arguments to this autotest.

        @param args: Options->values dictionary.
        @raises error.TestFail if a bad option is passed.
        """

        # Set default values for the options.
        # Architecture for which we are collecting afdo data.
        self._arch = 'amd64'
        # Use an alternate GS location where everyone can write.
        # The default depends on whether this is executing in
        # the lab environment or not.
        self._gs_test_location = not utils.host_is_in_lab_zone(
                self._host.hostname)
        # Ignore individual test failures.
        self._ignore_failures = False
        # Use local copy of telemetry instead of using the dev server copy.
        self._local = False
        # Chrome version to which the AFDO data corresponds.
        self._version, _ = self._host.get_chrome_version()
        # Run the benchmarks with only minimal support from Telemetry,
        # outside of the Telemetry framework. The Telemetry benchmarks in
        # ChromeOS are too flaky at this point, so this mode may be useful;
        # the default is False.
        self._minimal_telemetry = False

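        # All values arrive as strings from the --args= command line, so
        # boolean options are compared against the literal string 'True'.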
        for option_name, value in args.iteritems():
            if option_name == 'arch':
                self._arch = value
            elif option_name == 'gs_test_location':
                self._gs_test_location = (value == 'True')
            elif option_name == 'ignore_failures':
                self._ignore_failures = (value == 'True')
            elif option_name == 'local':
                self._local = (value == 'True')
            elif option_name == 'minimal_telemetry':
                self._minimal_telemetry = (value == 'True')
            elif option_name == 'version':
                self._version = value
            else:
                raise error.TestFail('Unknown option passed: %s' % option_name)

    def _run_test(self, benchmark, *args):
        """Run the benchmark using Telemetry.

        @param benchmark: Name of the benchmark to run.
        @param args: Additional arguments to pass to the telemetry execution
                     script.
        @raises error.TestFail if execution of the test failed.
                Also re-raises any exceptions thrown by run_telemetry_benchmark.
        """
        try:
            logging.info('Starting run for Telemetry benchmark %s', benchmark)
            start_time = time.time()
            result = self._telemetry_runner.run_telemetry_benchmark(
                    benchmark, None, *args)
            end_time = time.time()
            logging.info('Completed Telemetry benchmark %s in %f seconds',
                         benchmark, end_time - start_time)
        except error.TestBaseException as e:
            end_time = time.time()
            logging.info(
                    'Got exception from Telemetry benchmark %s '
                    'after %f seconds. Exception: %s', benchmark,
                    end_time - start_time, str(e))
            raise

        # We don't generate any keyvals for this run. This is not
        # an official run of the benchmark. We are just running it to get
        # a profile from it.

        if result.status is telemetry_runner.SUCCESS_STATUS:
            logging.info('Benchmark %s succeeded', benchmark)
        else:
            raise error.TestFail('An error occurred while executing'
                                 ' benchmark: %s' % benchmark)

    def _run_test_with_retry(self, benchmark, *args):
        """Run the benchmark using Telemetry. Retry once in case of failure.

        @param benchmark: Name of the benchmark to run.
        @param args: Additional arguments to pass to the telemetry execution
                     script.
        @raises Re-raises any exceptions thrown by _run_test.
        """

        tried = False
        while True:
            try:
                self._run_test(benchmark, *args)
                logging.info('Benchmark %s succeeded on %s try', benchmark,
                             'first' if not tried else 'second')
                break
            except error.TestBaseException:
                if not tried:
                    tried = True
                    logging.info('Benchmark %s failed. Retrying ...',
                                 benchmark)
                else:
                    logging.info('Benchmark %s failed twice. Not retrying',
                                 benchmark)
                    raise

    def _run_tests_minimal_telemetry(self):
        """Run the benchmarks using the minimal support from Telemetry.

        The benchmarks are run using a client side autotest test. This test
        will control Chrome directly using the chrome.Chrome support and it
        will ask Chrome to display the benchmark pages directly instead of
        using the "page sets" and "measurements" support from Telemetry.
        In this way we avoid using Telemetry benchmark support which is not
        stable on ChromeOS yet.
        """
        AFDO_GENERATE_CLIENT_TEST = 'telemetry_AFDOGenerateClient'

        # Execute the client side test.
        client_at = autotest.Autotest(self._host)
        client_at.run_test(AFDO_GENERATE_CLIENT_TEST, args='')

    @staticmethod
    def _get_compressed_name(name):
        """Given a file name, return the bz2 compressed name.

        @param name: Name of uncompressed file.
        @returns Name of compressed file.
        """
        return name + '.bz2'

    @staticmethod
    def _compress_file(unc_file, com_file):
        """Compresses the specified file with bz2.

        @param unc_file: name of file to compress.
        @param com_file: prefix name of compressed file.
        @raises error.TestFail if compression failed.
        @returns Name of compressed file.
        """
        dest = ''
        # perf.data is binary, so open it in binary mode.
        with open(unc_file, 'rb') as inp:
            dest = telemetry_AFDOGenerate._get_compressed_name(com_file)
            with bz2.BZ2File(dest, 'w') as out:
                for data in inp:
                    out.write(data)
        if not dest or not os.path.isfile(dest):
            raise error.TestFail('Could not compress %s' % unc_file)
        return dest

    def _gs_upload(self, local_file, remote_basename):
        """Uploads a file to a board-specific Google Storage location.

        @param local_file: name of file to upload.
        @param remote_basename: basename of remote file.
        @raises error.TestFail if upload failed.
        @returns nothing.
        """
        GS_GCC_DEST = 'gs://chromeos-prebuilt/afdo-job/canonicals/%s'
        GS_LLVM_DEST = 'gs://chromeos-prebuilt/afdo-job/llvm/%s'
        GS_LLVM_ASYNC_DEST = \
            'gs://chromeos-throw-away-bucket/afdo-job/llvm/benchmarks/%s'
        GS_TEST_DEST = 'gs://chromeos-throw-away-bucket/afdo-job/canonicals/%s'
        GS_ACL = 'project-private'

        board = self._host.get_board().split(':')[1]

        if self._gs_test_location:
            gs_dest = GS_TEST_DEST
        elif board in GCC_BOARDS:
            gs_dest = GS_GCC_DEST
        elif board in LLVM_BOARDS:
            gs_dest = GS_LLVM_DEST
        elif board in LLVM_BOARDS_ASYNC:
            gs_dest = GS_LLVM_ASYNC_DEST
            GS_ACL = 'public-read'
        else:
            raise error.TestFail('This test cannot be run on board %s' % board)

        remote_file = gs_dest % remote_basename

        logging.info('About to upload to GS: %s', remote_file)
        if not utils.gs_upload(
                local_file, remote_file, GS_ACL, result_dir=self.resultsdir):
            logging.info('Failed upload to GS: %s', remote_file)
            raise error.TestFail(
                    'Unable to gs upload %s to %s' % (local_file, remote_file))

        logging.info('Successful upload to GS: %s', remote_file)
444