#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2020 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Fetches and submits the latest test-cases from Lexan's crash bucket."""

import argparse
import contextlib
import datetime
import json
import logging
import os
import shutil
import subprocess
import sys
import tempfile
from typing import Generator, Iterable, List, Tuple

gsurl_base = 'gs://chrome-clang-crash-reports/v1'


def gsutil_ls(loc: str) -> List[str]:
    """Returns the entries that `gsutil.py ls` prints for `loc`.

    Each output line is stripped of surrounding whitespace; blank lines are
    dropped so callers never see an empty entry.

    Raises:
        subprocess.CalledProcessError: if `gsutil.py` exits unsuccessfully.
    """
    results = subprocess.run(['gsutil.py', 'ls', loc],
                             stdout=subprocess.PIPE,
                             check=True,
                             encoding='utf-8')
    stripped = (line.strip() for line in results.stdout.splitlines())
    return [line for line in stripped if line]


def _trailing_number(url: str) -> int:
    """Returns the last path component of `url`, parsed as an integer."""
    return int(url.rstrip('/').split('/')[-1])


def _as_date(value: datetime.date) -> datetime.date:
    """Returns `value` as a plain date, dropping any time-of-day component."""
    # `datetime.datetime` is a subclass of `datetime.date`, but the two cannot
    # be ordered against each other, so normalize before comparing.
    if isinstance(value, datetime.datetime):
        return value.date()
    return value


def gsurl_ls_last_numbers(url: str) -> List[int]:
    """Lists `url` and returns each entry's final path component as an int.

    The result is sorted in ascending order.

    Raises:
        ValueError: if an entry's final path component is not an integer.
    """
    return sorted(_trailing_number(x) for x in gsutil_ls(url))


def get_available_year_numbers() -> List[int]:
    """Returns the sorted year numbers listed directly under `gsurl_base`."""
    return gsurl_ls_last_numbers(gsurl_base)


def get_available_month_numbers(year: int) -> List[int]:
    """Returns the sorted month numbers listed under the given year."""
    return gsurl_ls_last_numbers(f'{gsurl_base}/{year}')


def get_available_day_numbers(year: int, month: int) -> List[int]:
    """Returns the sorted day numbers listed under the given year and month."""
    return gsurl_ls_last_numbers(f'{gsurl_base}/{year}/{month:02d}')


def get_available_test_case_urls(year: int, month: int, day: int) -> List[str]:
    """Returns the entries listed under the given year/month/day."""
    return gsutil_ls(f'{gsurl_base}/{year}/{month:02d}/{day:02d}')


def test_cases_on_or_after(
        date: datetime.date
) -> Generator[Tuple[datetime.date, List[str]], None, None]:
    """Yields all test-cases submitted on or after the given date.

    Args:
        date: the earliest date to consider. A `datetime.datetime` is accepted
            as well; only its date part is used.

    Yields:
        `(day, urls)` pairs in ascending date order, where `urls` is the
        listing for that day.
    """
    earliest = _as_date(date)
    for year in get_available_year_numbers():
        if year < earliest.year:
            continue

        for month in get_available_month_numbers(year):
            if year == earliest.year and month < earliest.month:
                continue

            for day in get_available_day_numbers(year, month):
                when = datetime.date(year, month, day)
                if when < earliest:
                    continue

                yield when, get_available_test_case_urls(year, month, day)


def to_ymd(date: datetime.date) -> str:
    """Formats `date` as a YYYY-MM-DD string."""
    return date.strftime('%Y-%m-%d')


def from_ymd(date_str: str) -> datetime.date:
    """Parses a YYYY-MM-DD string into a date.

    Raises:
        ValueError: if `date_str` is not in YYYY-MM-DD format.
    """
    return datetime.datetime.strptime(date_str, '%Y-%m-%d').date()


def persist_state(seen_urls: Iterable[str], state_file: str,
                  current_date: datetime.date) -> None:
    """Writes the seen URLs and most recent date to `state_file` as JSON.

    The data is written to `state_file + '.tmp'` first and then moved into
    place, so a partially-written state file is never left at `state_file`.

    Args:
        seen_urls: test-case URLs to record; stored sorted.
        state_file: path of the state file to (over)write.
        current_date: the date to record as `most_recent_date`.
    """
    tmp_state_file = state_file + '.tmp'
    with open(tmp_state_file, 'w', encoding='utf-8') as f:
        json.dump(
            {
                'already_seen': sorted(seen_urls),
                'most_recent_date': to_ymd(current_date),
            },
            f,
        )
    # `os.replace` overwrites an existing destination on every platform;
    # `os.rename` does not guarantee that.
    os.replace(tmp_state_file, state_file)


@contextlib.contextmanager
def temp_dir() -> Generator[str, None, None]:
    """Yields a fresh temporary directory, removing it on exit."""
    loc = tempfile.mkdtemp(prefix='lexan-autosubmit-')
    try:
        yield loc
    finally:
        shutil.rmtree(loc)


def download_and_unpack_test_case(gs_url: str, tempdir: str) -> None:
    """Downloads the archive at `gs_url` and extracts it into `tempdir`.

    The downloaded archive itself is deleted after extraction, so only the
    unpacked files remain in `tempdir`.

    Raises:
        subprocess.CalledProcessError: if `gsutil.py` or `tar` fails.
    """
    # Keep the archive's extension so `tar xaf` sees the original suffix.
    suffix = os.path.splitext(gs_url)[1]
    target_name = 'test_case' + suffix
    target = os.path.join(tempdir, target_name)
    subprocess.run(['gsutil.py', 'cp', gs_url, target], check=True)
    subprocess.run(['tar', 'xaf', target_name], check=True, cwd=tempdir)
    os.unlink(target)


def _select_repro_files(file_names: Iterable[str]) -> List[str]:
    """Returns `[sh_file, src_file]` picked out of an unpacked test-case.

    Raises:
        ValueError: unless, ignoring `.crash` files, there is exactly one `.sh`
            file and exactly one other file.
    """
    # Sometimes (e.g., in
    # gs://chrome-clang-crash-reports/v1/2020/03/27/
    # chromium.clang-ToTiOS-12754-GTXToolKit-2bfcde.tgz)
    # we'll get `.crash` files. Unclear why, but let's filter them out anyway.
    candidates = [x for x in file_names if not x.endswith('.crash')]
    sh_files = [x for x in candidates if x.endswith('.sh')]
    src_files = [x for x in candidates if not x.endswith('.sh')]
    if len(sh_files) != 1 or len(src_files) != 1:
        raise ValueError(
            'Expected exactly one .sh file and one source file; got '
            f'{candidates}')
    return [sh_files[0], src_files[0]]


def submit_test_case(gs_url: str, cr_tool: str) -> None:
    """Downloads the test-case at `gs_url` and submits it through `cr_tool`.

    Args:
        gs_url: URL of the test-case archive.
        cr_tool: path to the binary that is invoked with the `reduce`
            subcommand.

    Raises:
        ValueError: if the archive does not hold exactly one `.sh` file and
            one source file (ignoring `.crash` files).
        subprocess.CalledProcessError: if downloading, unpacking or submitting
            fails.
    """
    logging.info('Submitting %s', gs_url)
    with temp_dir() as tempdir:
        download_and_unpack_test_case(gs_url, tempdir)

        sh_name, src_name = _select_repro_files(os.listdir(tempdir))
        sh_file = os.path.join(tempdir, sh_name)
        src_file = os.path.join(tempdir, src_name)

        # Peephole: lexan got a crash upload with a way old clang. Ignore it.
        with open(sh_file, encoding='utf-8') as f:
            if 'Crash reproducer for clang version 9.0.0' in f.read():
                logging.warning(
                    'Skipping upload for %s; seems to be with an old clang',
                    gs_url)
                return

        subprocess.run(
            [
                cr_tool,
                'reduce',
                '-stream=false',
                '-wait=false',
                '-note',
                gs_url,
                '-sh_file',
                sh_file,
                '-src_file',
                src_file,
            ],
            check=True,
        )


def submit_new_test_cases(
        last_seen_test_cases: Iterable[str],
        earliest_date_to_check: datetime.date,
        forcey: str,
        state_file_path: str,
) -> None:
    """Submits new test-cases to forcey.

    This will persist state after each test-case is submitted.

    Args:
        last_seen_test_cases: test-cases which have been submitted already, and
            should be skipped if seen again.
        earliest_date_to_check: the earliest date we should consider
            test-cases from.
        forcey: path to the forcey binary.
        state_file_path: path to our state file.
    """
    # `all_test_cases_seen` is the union of all test-cases seen on this and
    # prior invocations. It guarantees, in all cases we care about, that we
    # won't submit the same test-case twice. `test_cases_seen_this_invocation`
    # is persisted as "all of the test-cases we've seen on this and prior
    # invocations" if we successfully submit _all_ test-cases.
    #
    # Since you can visualize the test-cases this script considers as a
    # sliding window that only moves forward, if we saw a test-case on a prior
    # iteration but no longer see it, we'll never see it again (since it fell
    # out of our sliding window by being too old). Hence, keeping it around is
    # pointless.
    #
    # We only persist this minimized set of test-cases if _everything_
    # succeeds, since if something fails below, there's a chance that we
    # haven't revisited test-cases that we've already seen.
    all_test_cases_seen = set(last_seen_test_cases)
    test_cases_seen_this_invocation = []
    most_recent_date = earliest_date_to_check
    for date, candidates in test_cases_on_or_after(earliest_date_to_check):
        most_recent_date = max(most_recent_date, date)

        for url in candidates:
            test_cases_seen_this_invocation.append(url)
            if url in all_test_cases_seen:
                continue

            all_test_cases_seen.add(url)
            submit_test_case(url, forcey)

            # Persisting on each iteration of this loop isn't free, but it's
            # the easiest way to not resubmit test-cases, and it's good to
            # keep in mind that:
            # - the state file will be small (<12KB, since it only keeps a
            #   few days worth of test-cases after the first run)
            # - in addition to this, we're downloading+unzipping+reuploading
            #   multiple MB of test-case bytes.
            #
            # So comparatively, the overhead here probably isn't an issue.
            persist_state(all_test_cases_seen, state_file_path,
                          most_recent_date)

    persist_state(test_cases_seen_this_invocation, state_file_path,
                  most_recent_date)


def main(argv: List[str]):
    """Parses `argv` and submits any test-cases that have not been seen yet.

    Without `--last_date`, the starting point and the already-submitted
    test-cases are read from the state file; with it, everything from that
    date on is considered new.
    """
    logging.basicConfig(
        format='>> %(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: '
        '%(message)s',
        level=logging.INFO,
    )

    my_dir = os.path.dirname(os.path.abspath(__file__))

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--state_file', default=os.path.join(my_dir, 'lexan-state.json'))
    parser.add_argument(
        '--last_date',
        help='The earliest date that we care about. All test cases from here '
        'on will be picked up. Format is YYYY-MM-DD.')
    parser.add_argument(
        '--4c', dest='forcey', required=True, help='Path to a 4c client binary')
    opts = parser.parse_args(argv)

    forcey = opts.forcey
    state_file = opts.state_file
    last_date_str = opts.last_date

    # Resolve to an absolute path first: for a bare filename `dirname` would
    # be '', and `os.makedirs('')` raises FileNotFoundError.
    state_dir = os.path.dirname(os.path.abspath(state_file))
    os.makedirs(state_dir, 0o755, exist_ok=True)

    if last_date_str is None:
        try:
            with open(state_file, encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            # `parser.error` prints the message and exits with status 2.
            parser.error(
                f'state file {state_file} does not exist; pass --last_date '
                'to pick a starting date')
        most_recent_date = from_ymd(data['most_recent_date'])
        submit_new_test_cases(
            last_seen_test_cases=data['already_seen'],
            # Note that we always subtract one day from this to avoid a race:
            # uploads may appear slightly out-of-order (or builders may lag,
            # or ...), so the last test-case uploaded for 2020/01/01 might
            # appear _after_ the first test-case for 2020/01/02. Assuming that
            # builders won't lag behind for over a day, the easiest way to
            # handle this is to always check the previous and current days.
            earliest_date_to_check=most_recent_date -
            datetime.timedelta(days=1),
            forcey=forcey,
            state_file_path=state_file,
        )
    else:
        submit_new_test_cases(
            last_seen_test_cases=(),
            earliest_date_to_check=from_ymd(last_date_str),
            forcey=forcey,
            state_file_path=state_file,
        )


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))