# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""
Test to generate the AFDO profile for a set of ChromeOS benchmarks.

This will run a pre-determined set of benchmarks on the DUT under
the monitoring of the linux "perf" tool. The resulting perf.data
file will then be copied to Google Storage (GS) where it can be
used by the AFDO optimized build.

Given that the telemetry benchmarks are quite unstable on ChromeOS at
this point, this test also supports a mode where the benchmarks are
executed outside of the telemetry framework. It is not the same as
executing the benchmarks under telemetry because no telemetry
measurements are taken but, for the purposes of profiling Chrome, it
should be pretty close.

Example invocation:
/usr/bin/test_that --debug --board=lumpy <DUT IP>
  --args="ignore_failures=True local=True gs_test_location=True"
  telemetry_AFDOGenerate
"""

import bz2
import logging
import os
import time

from contextlib import contextmanager

from autotest_lib.client.common_lib import error
from autotest_lib.server import autotest
from autotest_lib.server import test
from autotest_lib.server import utils
from autotest_lib.server.cros import filesystem_util
from autotest_lib.server.cros import telemetry_runner
from autotest_lib.site_utils import test_runner_utils

# These are arguments to the linux "perf" tool.
# The -e value is processor specific and comes from the Intel SDM vol 3b.
PROFILER_ARGS = 'record -a -e r20c4 -c 500000 -b'

WAIT_FOR_CMD_TIMEOUT_SECS = 60

# Reuse ssh and scp settings from telemetry_Crosperf.
RSA_KEY = '-i %s' % test_runner_utils.TEST_KEY_PATH
DUT_SCP_OPTIONS = ' '.join([
        '-o StrictHostKeyChecking=no', '-o UserKnownHostsFile=/dev/null',
        '-o BatchMode=yes', '-o ConnectTimeout=30',
        '-o ServerAliveInterval=900', '-o ServerAliveCountMax=3',
        '-o ConnectionAttempts=4', '-o Protocol=2'
])
DUT_CHROME_RESULTS_DIR = '/usr/local/telemetry/src/tools/perf'

# Polls `ps` once per second until the process disappears or the timeout
# is exhausted; the final negated `ps` makes the command exit 0 only if
# the process is gone.
_WAIT_CMD_TEMPLATE = """\
for _ in {1..%(timeout)d}; do \
  ps %(pid)d >/dev/null || break; \
  sleep 1; \
done; \
! ps %(pid)d >/dev/null \
"""


def _wait_for_process(host, pid, timeout=-1):
    """Waits for a process on the DUT to terminate.

    @param host: A host object representing the DUT.
    @param pid: The process ID (integer).
    @param timeout: Maximum number of seconds to wait.

    @returns 0 if the process terminated within the timeout,
            non-zero otherwise.
    """
    wait_cmd = _WAIT_CMD_TEMPLATE % {'pid': pid, 'timeout': timeout}
    return host.run(wait_cmd, ignore_status=True).exit_status


# List of benchmarks to run to capture profile information. This is
# based on the "superhero" list and other telemetry benchmarks. The goal
# is to have a short list that is as representative as possible and takes
# a short time to execute. At this point the list of benchmarks is in flux.
TELEMETRY_AFDO_BENCHMARKS = (
        # page_cycler tests are deprecated. Replace them with loading.desktop.
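        # Each entry is a (benchmark_name, (extra_arg, ...)) pair; the extra
        # arguments are passed through unchanged to the telemetry
        # run_benchmark invocation (see run_once below).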
        ('loading.desktop', ('--pageset-repeat=1',
                             '--story-tag-filter=typical',
                             '--legacy-json-trace-format')),
        ('loading.desktop', ('--pageset-repeat=1',
                             '--story-tag-filter=intl_ja_zh',
                             '--legacy-json-trace-format')),
        ('rendering.desktop',
         ('--story-tag-filter=tough_canvas',
          '--story-filter="bouncing\\*\\|canvas\\*\\|microsoft\\*"',
          '--legacy-json-trace-format')),
        ('octane', ('--legacy-json-trace-format',)),
        ('kraken', ('--legacy-json-trace-format',)),
        ('speedometer2', ('--legacy-json-trace-format',)),
)

# Temporarily disable this benchmark because it is failing a
# lot. Filed chromium:590127
# ('smoothness.tough_webgl_cases',)

# Some benchmarks removed from the profile set:
# 'page_cycler.morejs' -> uninteresting, seems to fail frequently.
# 'page_cycler.moz' -> seems very old.
# 'media.tough_video_cases' -> removed because it does not bring
#     any benefit and takes more than 12 mins.

# List of boards where this test can be run. Currently, it needs
# machines with at least 4GB of memory or 2GB of /tmp.
# This must be consistent with chromite.
GCC_BOARDS = ['lumpy']

# Should be disjoint with GCC_BOARDS.
LLVM_BOARDS = ['chell']

# FIXME(tcwang): only used for testing Async AFDO generation builders.
# Remove this after testing is done.
# Due to crbug.com/991299 and crbug.com/992539, AFDO profiles generated
# by samus are not suitable for production on either master or branch.
# So samus is suitable for testing profile generation, but its profiles
# should not actually be used.
LLVM_BOARDS_ASYNC = ['samus']


class telemetry_AFDOGenerate(test.test):
    """
    Run one or more telemetry benchmarks under the "perf" monitoring
    tool, generate a "perf.data" file and upload it to GS for consumption
    by the AFDO optimized build.
    """
    version = 1

    def scp_perf_data(self, dut, host_dir):
        """Copy perf data from the DUT.

        @param dut: The autotest host object representing the DUT.
        @param host_dir: The directory on the host in which to put the file.

        @returns status code of the scp command.
        """
        cmd = []
        src = ('root@%s:%s/%s' % (dut.hostname, DUT_CHROME_RESULTS_DIR,
                                  'perf.data'))
        cmd.extend(['scp', DUT_SCP_OPTIONS, RSA_KEY, '-v', src, host_dir])
        command = ' '.join(cmd)

        logging.debug('Retrieving Perf Data: %s', command)
        try:
            result = utils.run(command, timeout=WAIT_FOR_CMD_TIMEOUT_SECS)
            exit_code = result.exit_status
        except Exception as e:
            logging.error('Failed to retrieve results: %s', e)
            raise

        logging.debug('command return value: %d', exit_code)
        return exit_code

    @contextmanager
    def perf_on_dut(self):
        """Start a perf process on the DUT and kill it when done."""
        logging.info('Starting perf process in background.')
        perf_cmd = 'nohup perf %s -o %s/perf.data' \
                % (PROFILER_ARGS, DUT_CHROME_RESULTS_DIR)
        perf_pid = self._host.run_background(perf_cmd)

        try:
            # Use `kill -0` to check whether the perf process is alive.
            verify_cmd = 'kill -0 %s' % perf_pid
            if self._host.run(verify_cmd, ignore_status=True).exit_status != 0:
                logging.error('Perf process not started correctly on DUT')
                raise RuntimeError(
                        'Perf process not started correctly on DUT')
            logging.info('Perf PID: %s\nPerf command: %s', perf_pid, perf_cmd)
            yield
        finally:
            # Check if the process is still alive after the benchmark run;
            # if yes, kill it with -2 (which is SIGINT).
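            # SIGINT lets perf flush its buffered samples and write out a
            # well-formed perf.data file before exiting, which a plain
            # SIGKILL would not.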
            kill_cmd = 'kill -0 %s && killall -2 perf' % perf_pid
            if self._host.run(kill_cmd, ignore_status=True).exit_status != 0:
                logging.error('Perf process is not killed correctly on DUT.')
                raise RuntimeError(
                        'Perf process is not killed correctly on DUT.')
            # The perf process may not terminate right after the kill
            # command; wait until it finishes.
            status = _wait_for_process(self._host, int(perf_pid),
                                       WAIT_FOR_CMD_TIMEOUT_SECS)
            if status != 0:
                logging.error('Error waiting for perf process to be killed.')
                raise RuntimeError(
                        'Error waiting for perf process to be killed.')
            logging.info('Perf has been killed on DUT.')

            status = self.scp_perf_data(self._host, self.profdir)
            if status != 0:
                logging.error('Cannot copy perf.data file to host.')
                raise RuntimeError('Cannot copy perf.data file to host.')

    def run_once(self, host, args):
        """Run a set of telemetry benchmarks.

        @param host: Host machine where the test is run.
        @param args: A dictionary of the arguments that were passed
                to this test.
        @returns None.
        """
        self._host = host
        host_board = host.get_board().split(':')[1]

        if not (host_board in LLVM_BOARDS or host_board in GCC_BOARDS
                or host_board in LLVM_BOARDS_ASYNC):
            raise error.TestFail(
                    'This test cannot be run on board %s' % host_board)

        self._parse_args(args)

        # Remove rootfs write protection up front; otherwise the telemetry
        # code will try to remove it during run_benchmark, which causes the
        # machine to reboot and remount. We want to avoid that.
        filesystem_util.make_rootfs_writable(self._host)

        with self.perf_on_dut():
            if self._minimal_telemetry:
                self._run_tests_minimal_telemetry()
            else:
                self._telemetry_runner = telemetry_runner.TelemetryRunner(
                        self._host, self._local, telemetry_on_dut=False)

                for benchmark_info in TELEMETRY_AFDO_BENCHMARKS:
                    benchmark = benchmark_info[0]
                    # Avoid shadowing the 'args' parameter of run_once.
                    benchmark_args = (() if len(benchmark_info) == 1
                                      else benchmark_info[1])
                    try:
                        self._run_test_with_retry(benchmark, *benchmark_args)
                    except error.TestBaseException:
                        if not self._ignore_failures:
                            raise
                        logging.info('Ignoring failure from benchmark %s.',
                                     benchmark)

    def after_run_once(self):
        """After the profile information has been collected, compress it
        and upload it to GS.
        """
        PERF_FILE = 'perf.data'
        COMP_PERF_FILE = 'chromeos-chrome-%s-%s.perf.data'
        perf_data = os.path.join(self.profdir, PERF_FILE)
        comp_data = os.path.join(self.profdir,
                                 COMP_PERF_FILE % (self._arch, self._version))
        compressed = self._compress_file(perf_data, comp_data)
        self._gs_upload(compressed, os.path.basename(compressed))

        # Also create a copy of this file using "LATEST" as the version so
        # it can be found in case the builder is looking for a version
        # number that does not match. It is ok to use a slightly old
        # version of this file for the optimized build.
        latest_data = COMP_PERF_FILE % (self._arch, 'LATEST')
        latest_compressed = self._get_compressed_name(latest_data)
        self._gs_upload(compressed, latest_compressed)

        # Remove the files so that they are not uploaded along with the logs.
        os.remove(compressed)
        os.remove(perf_data)

    def _parse_args(self, args):
        """Parses input arguments to this autotest.

        @param args: Options->values dictionary.
        @raises error.TestFail if a bad option is passed.
        """

        # Set default values for the options.
        # Architecture for which we are collecting afdo data.
        self._arch = 'amd64'
        # Use an alternate GS location where everyone can write.
        # The default depends on whether this is executing in the
        # lab environment or not.
        self._gs_test_location = not utils.host_is_in_lab_zone(
                self._host.hostname)
        # Ignore individual test failures.
        self._ignore_failures = False
        # Use a local copy of telemetry instead of the dev server copy.
        self._local = False
        # Chrome version to which the AFDO data corresponds.
        self._version, _ = self._host.get_chrome_version()
        # Run the benchmarks with only minimal support from Telemetry,
        # since the Telemetry benchmarks on ChromeOS are too flaky at this
        # point. Defaults to False; enable with minimal_telemetry=True.
        self._minimal_telemetry = False
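
        # Note: option values arrive as strings from the --args flag, which
        # is why the boolean options below are compared against the literal
        # string 'True'.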
        for option_name, value in args.iteritems():
            if option_name == 'arch':
                self._arch = value
            elif option_name == 'gs_test_location':
                self._gs_test_location = (value == 'True')
            elif option_name == 'ignore_failures':
                self._ignore_failures = (value == 'True')
            elif option_name == 'local':
                self._local = (value == 'True')
            elif option_name == 'minimal_telemetry':
                self._minimal_telemetry = (value == 'True')
            elif option_name == 'version':
                self._version = value
            else:
                raise error.TestFail(
                        'Unknown option passed: %s' % option_name)

    def _run_test(self, benchmark, *args):
        """Run the benchmark using Telemetry.

        @param benchmark: Name of the benchmark to run.
        @param args: Additional arguments to pass to the telemetry execution
                script.
        @raises error.TestFail if execution of the test failed.
                Also re-raises any exceptions thrown by
                run_telemetry_benchmark.
        """
        try:
            logging.info('Starting run for Telemetry benchmark %s', benchmark)
            start_time = time.time()
            result = self._telemetry_runner.run_telemetry_benchmark(
                    benchmark, None, *args)
            end_time = time.time()
            logging.info('Completed Telemetry benchmark %s in %f seconds',
                         benchmark, end_time - start_time)
        except error.TestBaseException as e:
            end_time = time.time()
            logging.info(
                    'Got exception from Telemetry benchmark %s '
                    'after %f seconds. Exception: %s', benchmark,
                    end_time - start_time, str(e))
            raise

        # We don't generate any keyvals for this run. This is not
        # an official run of the benchmark. We are just running it to get
        # a profile from it.

        if result.status is telemetry_runner.SUCCESS_STATUS:
            logging.info('Benchmark %s succeeded', benchmark)
        else:
            raise error.TestFail('An error occurred while executing'
                                 ' benchmark: %s' % benchmark)
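
    # The Telemetry benchmarks are still flaky on ChromeOS (see the module
    # docstring), so each benchmark gets exactly one retry before its
    # failure is propagated (and possibly ignored via ignore_failures).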
    def _run_test_with_retry(self, benchmark, *args):
        """Run the benchmark using Telemetry. Retry once in case of failure.

        @param benchmark: Name of the benchmark to run.
        @param args: Additional arguments to pass to the telemetry execution
                script.
        @raises Re-raises any exceptions thrown by _run_test.
        """

        tried = False
        while True:
            try:
                self._run_test(benchmark, *args)
                logging.info('Benchmark %s succeeded on %s try', benchmark,
                             'first' if not tried else 'second')
                break
            except error.TestBaseException:
                if not tried:
                    tried = True
                    logging.info('Benchmark %s failed. Retrying ...',
                                 benchmark)
                else:
                    logging.info('Benchmark %s failed twice. Not retrying.',
                                 benchmark)
                    raise

    def _run_tests_minimal_telemetry(self):
        """Run the benchmarks using minimal support from Telemetry.

        The benchmarks are run using a client-side autotest test. This test
        controls Chrome directly using the chrome.Chrome support and asks
        Chrome to display the benchmark pages directly instead of using the
        "page sets" and "measurements" support from Telemetry. In this way
        we avoid the Telemetry benchmark support, which is not yet stable
        on ChromeOS.
        """
        AFDO_GENERATE_CLIENT_TEST = 'telemetry_AFDOGenerateClient'

        # Execute the client side test.
        client_at = autotest.Autotest(self._host)
        client_at.run_test(AFDO_GENERATE_CLIENT_TEST, args='')

    @staticmethod
    def _get_compressed_name(name):
        """Given a file name, returns the bz2 compressed name.

        @param name: Name of the uncompressed file.
        @returns Name of the compressed file.
        """
        return name + '.bz2'

    @staticmethod
    def _compress_file(unc_file, com_file):
        """Compresses the specified file with bz2.

        @param unc_file: Name of the file to compress.
        @param com_file: Prefix name of the compressed file.
        @raises error.TestFail if compression failed.
        @returns Name of the compressed file.
        """
        dest = telemetry_AFDOGenerate._get_compressed_name(com_file)
        # perf.data is binary, so read and write in binary mode.
        with open(unc_file, 'rb') as inp:
            with bz2.BZ2File(dest, 'wb') as out:
                for data in inp:
                    out.write(data)
        if not os.path.isfile(dest):
            raise error.TestFail('Could not compress %s' % unc_file)
        return dest

    def _gs_upload(self, local_file, remote_basename):
        """Uploads a file to a specific Google Storage location.

        @param local_file: Name of the file to upload.
        @param remote_basename: Basename of the remote file.
        @raises error.TestFail if the upload failed.
        @returns nothing.
        """
        GS_GCC_DEST = 'gs://chromeos-prebuilt/afdo-job/canonicals/%s'
        GS_LLVM_DEST = 'gs://chromeos-prebuilt/afdo-job/llvm/%s'
        GS_LLVM_ASYNC_DEST = \
                'gs://chromeos-throw-away-bucket/afdo-job/llvm/benchmarks/%s'
        GS_TEST_DEST = 'gs://chromeos-throw-away-bucket/afdo-job/canonicals/%s'
        gs_acl = 'project-private'

        board = self._host.get_board().split(':')[1]

        if self._gs_test_location:
            gs_dest = GS_TEST_DEST
        elif board in GCC_BOARDS:
            gs_dest = GS_GCC_DEST
        elif board in LLVM_BOARDS:
            gs_dest = GS_LLVM_DEST
        elif board in LLVM_BOARDS_ASYNC:
            gs_dest = GS_LLVM_ASYNC_DEST
            gs_acl = 'public-read'
        else:
            raise error.TestFail('This test cannot be run on board %s' % board)

        remote_file = gs_dest % remote_basename

        logging.info('About to upload to GS: %s', remote_file)
        if not utils.gs_upload(
                local_file, remote_file, gs_acl, result_dir=self.resultsdir):
            logging.info('Failed upload to GS: %s', remote_file)
            raise error.TestFail(
                    'Unable to gs upload %s to %s' % (local_file, remote_file))

        logging.info('Successful upload to GS: %s', remote_file)
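
# For reference, with the default arch ('amd64'), a hypothetical Chrome
# version '78.0.3904.0', and an LLVM board such as 'chell', after_run_once
# would upload:
#   gs://chromeos-prebuilt/afdo-job/llvm/chromeos-chrome-amd64-78.0.3904.0.perf.data.bz2
#   gs://chromeos-prebuilt/afdo-job/llvm/chromeos-chrome-amd64-LATEST.perf.data.bz2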