#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2020 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Fetches and submits the latest test-cases from Lexan's crash bucket."""

import argparse
import contextlib
import datetime
import json
import logging
import os
import shutil
import subprocess
import sys
import tempfile
from typing import Generator, Iterable, List, Tuple


gsurl_base = "gs://chrome-clang-crash-reports/v1"


def gsutil_ls(loc: str) -> List[str]:
    """Runs `gsutil.py ls` and returns one stripped line per result."""
    results = subprocess.run(
        ["gsutil.py", "ls", loc],
        stdout=subprocess.PIPE,
        check=True,
        encoding="utf-8",
    )
    return [line.strip() for line in results.stdout.splitlines()]


def gsurl_ls_last_numbers(url: str) -> List[int]:
    """Lists `url` and returns the numeric final path components, sorted."""
    return sorted(int(x.rstrip("/").split("/")[-1]) for x in gsutil_ls(url))


def get_available_year_numbers() -> List[int]:
    return gsurl_ls_last_numbers(gsurl_base)


def get_available_month_numbers(year: int) -> List[int]:
    return gsurl_ls_last_numbers(f"{gsurl_base}/{year}")


def get_available_day_numbers(year: int, month: int) -> List[int]:
    return gsurl_ls_last_numbers(f"{gsurl_base}/{year}/{month:02d}")


def get_available_test_case_urls(year: int, month: int, day: int) -> List[str]:
    return gsutil_ls(f"{gsurl_base}/{year}/{month:02d}/{day:02d}")


def test_cases_on_or_after(
    date: datetime.date,
) -> Generator[Tuple[datetime.date, List[str]], None, None]:
    """Yields (date, test-case URLs) for each day on or after the given date."""
    for year in get_available_year_numbers():
        if year < date.year:
            continue

        for month in get_available_month_numbers(year):
            if year == date.year and month < date.month:
                continue

            for day in get_available_day_numbers(year, month):
                when = datetime.date(year, month, day)
                if when < date:
                    continue

                yield when, get_available_test_case_urls(year, month, day)


def to_ymd(date: datetime.date) -> str:
    return date.strftime("%Y-%m-%d")


def from_ymd(date_str: str) -> datetime.date:
    return datetime.datetime.strptime(date_str, "%Y-%m-%d").date()


def persist_state(
    seen_urls: Iterable[str], state_file: str, current_date: datetime.date
) -> None:
    """Atomically persists the seen URLs and the most recent date."""
    tmp_state_file = state_file + ".tmp"
    with open(tmp_state_file, "w", encoding="utf-8") as f:
        json.dump(
            {
                "already_seen": sorted(seen_urls),
                "most_recent_date": to_ymd(current_date),
            },
            f,
        )
    os.rename(tmp_state_file, state_file)


@contextlib.contextmanager
def temp_dir() -> Generator[str, None, None]:
    """Yields a temporary directory that's removed when the context exits."""
    loc = tempfile.mkdtemp("lexan-autosubmit")
    try:
        yield loc
    finally:
        shutil.rmtree(loc)


def download_and_unpack_test_case(gs_url: str, tempdir: str) -> None:
    """Downloads the tarball at `gs_url` and unpacks it into `tempdir`."""
    suffix = os.path.splitext(gs_url)[1]
    target_name = "test_case" + suffix
    target = os.path.join(tempdir, target_name)
    subprocess.run(["gsutil.py", "cp", gs_url, target], check=True)
    subprocess.run(["tar", "xaf", target_name], check=True, cwd=tempdir)
    os.unlink(target)


def submit_test_case(gs_url: str, cr_tool: str) -> None:
    """Downloads, validates, and submits the test-case at `gs_url`."""
    logging.info("Submitting %s", gs_url)
    with temp_dir() as tempdir:
        download_and_unpack_test_case(gs_url, tempdir)

        # Sometimes (e.g., in
        # gs://chrome-clang-crash-reports/v1/2020/03/27/
        # chromium.clang-ToTiOS-12754-GTXToolKit-2bfcde.tgz)
        # we'll get `.crash` files. Unclear why, but let's filter them out
        # anyway.
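        # A well-formed upload leaves exactly two files behind: the
        # reproducer shell script (`*.sh`) and the source file it compiles;
        # the asserts below enforce that.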
        repro_files = [
            os.path.join(tempdir, x)
            for x in os.listdir(tempdir)
            if not x.endswith(".crash")
        ]
        assert len(repro_files) == 2, repro_files
        if repro_files[0].endswith(".sh"):
            sh_file, src_file = repro_files
            assert not src_file.endswith(".sh"), repro_files
        else:
            src_file, sh_file = repro_files
            assert sh_file.endswith(".sh"), repro_files

        # Peephole: lexan got a crash upload with a way old clang. Ignore it.
        with open(sh_file, encoding="utf-8") as f:
            if "Crash reproducer for clang version 9.0.0" in f.read():
                logging.warning(
                    "Skipping upload for %s; seems to be with an old clang",
                    gs_url,
                )
                return

        subprocess.run(
            [
                cr_tool,
                "reduce",
                "-stream=false",
                "-wait=false",
                "-note",
                gs_url,
                "-sh_file",
                sh_file,
                "-src_file",
                src_file,
            ],
            check=True,
        )


def submit_new_test_cases(
    last_seen_test_cases: Iterable[str],
    earliest_date_to_check: datetime.date,
    forcey: str,
    state_file_path: str,
) -> None:
    """Submits new test-cases to forcey.

    This will persist state after each test-case is submitted.

    Args:
        last_seen_test_cases: test-cases which have been submitted already,
            and should be skipped if seen again.
        earliest_date_to_check: the earliest date we should consider
            test-cases from.
        forcey: path to the forcey binary.
        state_file_path: path to our state file.
    """
    # `all_test_cases_seen` is the union of all test-cases seen on this and
    # prior invocations. It guarantees, in all cases we care about, that we
    # won't submit the same test-case twice. `test_cases_seen_this_invocation`
    # is persisted as "all of the test-cases we've seen on this and prior
    # invocations" if we successfully submit _all_ test-cases.
    #
    # Since you can visualize the test-cases this script considers as a
    # sliding window that only moves forward, if we saw a test-case on a
    # prior invocation but no longer see it, we'll never see it again (since
    # it fell out of our sliding window by being too old). Hence, keeping it
    # around is pointless.
    #
    # We only persist this minimized set of test-cases if _everything_
    # succeeds, since if something fails below, there's a chance that we
    # haven't revisited test-cases that we've already seen.
    all_test_cases_seen = set(last_seen_test_cases)
    test_cases_seen_this_invocation = []
    most_recent_date = earliest_date_to_check
    for date, candidates in test_cases_on_or_after(earliest_date_to_check):
        most_recent_date = max(most_recent_date, date)

        for url in candidates:
            test_cases_seen_this_invocation.append(url)
            if url in all_test_cases_seen:
                continue

            all_test_cases_seen.add(url)
            submit_test_case(url, forcey)

            # Persisting on each iteration of this loop isn't free, but it's
            # the easiest way to not resubmit test-cases, and it's good to
            # keep in mind that:
            # - the state file will be small (<12KB, since it only keeps a
            #   few days worth of test-cases after the first run)
            # - in addition to this, we're downloading+unzipping+reuploading
            #   multiple MB of test-case bytes.
            #
            # So comparatively, the overhead here probably isn't an issue.
            persist_state(
                all_test_cases_seen, state_file_path, most_recent_date
            )

    persist_state(
        test_cases_seen_this_invocation, state_file_path, most_recent_date
    )


def main(argv: List[str]):
    logging.basicConfig(
        format=">> %(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: "
        "%(message)s",
        level=logging.INFO,
    )

    my_dir = os.path.dirname(os.path.abspath(__file__))

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--state_file", default=os.path.join(my_dir, "lexan-state.json")
    )
    parser.add_argument(
        "--last_date",
        help="The earliest date that we care about. All test cases from here "
        "on will be picked up. Format is YYYY-MM-DD.",
    )
    parser.add_argument(
        "--4c", dest="forcey", required=True, help="Path to a 4c client binary"
    )
    opts = parser.parse_args(argv)

    forcey = opts.forcey
    state_file = opts.state_file
    last_date_str = opts.last_date

    os.makedirs(os.path.dirname(state_file), 0o755, exist_ok=True)

    if last_date_str is None:
        with open(state_file, encoding="utf-8") as f:
            data = json.load(f)
        most_recent_date = from_ymd(data["most_recent_date"])
        submit_new_test_cases(
            last_seen_test_cases=data["already_seen"],
            # Note that we always subtract one day from this to avoid a race:
            # uploads may appear slightly out-of-order (or builders may lag,
            # or ...), so the last test-case uploaded for 2020/01/01 might
            # appear _after_ the first test-case for 2020/01/02. Assuming
            # that builders won't lag behind for over a day, the easiest way
            # to handle this is to always check the previous and current days.
            earliest_date_to_check=most_recent_date
            - datetime.timedelta(days=1),
            forcey=forcey,
            state_file_path=state_file,
        )
    else:
        submit_new_test_cases(
            last_seen_test_cases=(),
            earliest_date_to_check=from_ymd(last_date_str),
            forcey=forcey,
            state_file_path=state_file,
        )


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))