• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
3# Copyright 2020 The ChromiumOS Authors
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Fetches and submits the latest test-cases from Lexan's crash bucket."""
8
import argparse
import contextlib
import datetime
import json
import logging
import os
import shutil
import subprocess
import sys
import tempfile
from typing import Generator, Iterable, List, Tuple
20
21
# Root GCS location of the crash reports. The helpers below list
# `<gsurl_base>/<year>/<month>/<day>` paths underneath it.
gsurl_base = "gs://chrome-clang-crash-reports/v1"
23
24
def gsutil_ls(loc: str) -> List[str]:
    """Runs `gsutil.py ls` on `loc` and returns its output lines, stripped.

    Args:
      loc: the location to hand to `gsutil.py ls`.

    Returns:
      One whitespace-stripped string per line of the command's stdout.

    Raises:
      subprocess.CalledProcessError: if `gsutil.py` exits with nonzero status.
    """
    listing = subprocess.run(
        ["gsutil.py", "ls", loc],
        check=True,
        encoding="utf-8",
        stdout=subprocess.PIPE,
    ).stdout
    return list(map(str.strip, listing.splitlines()))
33
34
def gsurl_ls_last_numbers(url: str) -> List[int]:
    """Lists `url` and returns the final path component of each entry as an int.

    Trailing slashes are dropped before the last component is taken. The result
    is sorted in ascending order.

    Raises:
      ValueError: if an entry's last path component is not an integer.
    """
    numbers = []
    for entry in gsutil_ls(url):
        last_component = entry.rstrip("/").split("/")[-1]
        numbers.append(int(last_component))
    numbers.sort()
    return numbers
37
38
def get_available_year_numbers() -> List[int]:
    """Returns the sorted year numbers listed directly under `gsurl_base`."""
    years = gsurl_ls_last_numbers(gsurl_base)
    return years
41
42
def get_available_month_numbers(year: int) -> List[int]:
    """Returns the sorted month numbers listed under `year` in the bucket."""
    year_url = gsurl_base + f"/{year}"
    return gsurl_ls_last_numbers(year_url)
45
46
def get_available_day_numbers(year: int, month: int) -> List[int]:
    """Returns the sorted day numbers listed under `year`/`month` in the bucket.

    The month is zero-padded to two digits when building the listing URL.
    """
    month_url = gsurl_base + f"/{year}/{month:02d}"
    return gsurl_ls_last_numbers(month_url)
49
50
def get_available_test_case_urls(year: int, month: int, day: int) -> List[str]:
    """Returns the entries listed under the given `year`/`month`/`day`.

    Month and day are zero-padded to two digits when building the listing URL.
    """
    day_url = gsurl_base + f"/{year}/{month:02d}/{day:02d}"
    return gsutil_ls(day_url)
53
54
def test_cases_on_or_after(
    date: datetime.date,
) -> Generator[Tuple[datetime.date, List[str]], None, None]:
    """Yields the test-case URLs for every available day on or after `date`.

    Args:
      date: the earliest day to consider. A `datetime.datetime` is accepted as
        well; only its calendar date is used.

    Yields:
      `(day, urls)` pairs in ascending date order, where `urls` is the listing
      of test-case URLs found under that day.
    """
    # A `datetime.datetime` cannot be ordered against the `datetime.date`
    # values built below (that comparison raises TypeError), so reduce it to
    # its date up front.
    if isinstance(date, datetime.datetime):
        date = date.date()

    for year in get_available_year_numbers():
        if year < date.year:
            continue

        for month in get_available_month_numbers(year):
            # Months only need filtering within the starting year; every month
            # of a later year is in range.
            if year == date.year and month < date.month:
                continue

            for day in get_available_day_numbers(year, month):
                when = datetime.date(year, month, day)
                if when < date:
                    continue

                yield when, get_available_test_case_urls(year, month, day)
73
74
def to_ymd(date: datetime.date) -> str:
    """Formats `date` using the `%Y-%m-%d` (year-month-day) pattern."""
    return f"{date:%Y-%m-%d}"
77
78
def from_ymd(date_str: str) -> datetime.date:
    """Parses a `%Y-%m-%d` (year-month-day) string into a `datetime.date`.

    Raises:
      ValueError: if `date_str` does not match the expected format.
    """
    parsed = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    return parsed.date()
81
82
def persist_state(
    seen_urls: Iterable[str], state_file: str, current_date: datetime.date
) -> None:
    """Writes the seen URLs and the most recent date to `state_file` as JSON.

    The JSON is first written to `<state_file>.tmp` and then moved over
    `state_file`, so an interrupted write leaves the previous state file
    intact.

    Args:
      seen_urls: test-case URLs to record; stored sorted under "already_seen".
      state_file: path of the state file to (over)write.
      current_date: date stored under "most_recent_date" in YYYY-MM-DD form.
    """
    tmp_state_file = state_file + ".tmp"
    with open(tmp_state_file, "w", encoding="utf-8") as f:
        json.dump(
            {
                "already_seen": sorted(seen_urls),
                "most_recent_date": to_ymd(current_date),
            },
            f,
        )
    # `os.replace` overwrites an existing destination on every platform,
    # whereas `os.rename` raises on Windows once the state file already exists.
    os.replace(tmp_state_file, state_file)
96
97
@contextlib.contextmanager
def temp_dir() -> Generator[str, None, None]:
    """Context manager yielding a fresh temporary directory.

    The directory and everything in it are removed when the `with` block
    exits, whether normally or through an exception.

    Yields:
      The path of the newly created directory.
    """
    # The first positional argument of `mkdtemp` is the *suffix*; name the
    # argument so the label ends up at the front of the directory name.
    loc = tempfile.mkdtemp(prefix="lexan-autosubmit-")
    try:
        yield loc
    finally:
        shutil.rmtree(loc)
105
106
def download_and_unpack_test_case(gs_url: str, tempdir: str) -> None:
    """Copies the archive at `gs_url` into `tempdir` and extracts it there.

    The downloaded archive is deleted once extraction succeeds, leaving only
    the extracted contents in `tempdir`.

    Raises:
      subprocess.CalledProcessError: if `gsutil.py cp` or `tar` fails.
    """
    # Keep the URL's extension on the local copy of the archive.
    _, extension = os.path.splitext(gs_url)
    archive_name = f"test_case{extension}"
    archive_path = os.path.join(tempdir, archive_name)

    subprocess.run(["gsutil.py", "cp", gs_url, archive_path], check=True)
    # tar runs with `tempdir` as its working directory, so the bare archive
    # name resolves there and the contents are extracted next to it.
    subprocess.run(["tar", "xaf", archive_name], cwd=tempdir, check=True)
    os.unlink(archive_path)
114
115
def submit_test_case(gs_url: str, cr_tool: str) -> None:
    """Downloads the test-case at `gs_url` and submits it via `cr_tool reduce`.

    Test-cases whose reproducer script mentions clang version 9.0.0 are skipped
    with a warning rather than submitted.

    Args:
      gs_url: URL of the test-case archive; also attached to the submission
        through `-note`.
      cr_tool: path to the binary invoked with the `reduce` subcommand.

    Raises:
      ValueError: if the unpacked archive does not contain exactly one `.sh`
        file and exactly one other file (ignoring `.crash` files).
      subprocess.CalledProcessError: if downloading, unpacking, or submitting
        fails.
    """
    logging.info("Submitting %s", gs_url)
    with temp_dir() as tempdir:
        download_and_unpack_test_case(gs_url, tempdir)

        # Sometimes (e.g., in
        # gs://chrome-clang-crash-reports/v1/2020/03/27/
        # chromium.clang-ToTiOS-12754-GTXToolKit-2bfcde.tgz)
        # we'll get `.crash` files. Unclear why, but let's filter them out anyway.
        repro_files = sorted(
            os.path.join(tempdir, x)
            for x in os.listdir(tempdir)
            if not x.endswith(".crash")
        )
        sh_files = [x for x in repro_files if x.endswith(".sh")]
        src_files = [x for x in repro_files if not x.endswith(".sh")]
        # Raise explicitly instead of asserting: asserts vanish under `-O`, and
        # a malformed archive must never be submitted.
        if len(sh_files) != 1 or len(src_files) != 1:
            raise ValueError(
                f"Expected exactly one .sh file and one source file in "
                f"{gs_url}; got {repro_files}"
            )
        sh_file = sh_files[0]
        src_file = src_files[0]

        # Peephole: lexan got a crash upload with a way old clang. Ignore it.
        with open(sh_file, encoding="utf-8") as f:
            if "Crash reproducer for clang version 9.0.0" in f.read():
                logging.warning(
                    "Skipping upload for %s; seems to be with an old clang",
                    gs_url,
                )
                return

        # `sh_file` and `src_file` already include `tempdir`; joining them with
        # it a second time would break if `tempdir` were ever a relative path.
        subprocess.run(
            [
                cr_tool,
                "reduce",
                "-stream=false",
                "-wait=false",
                "-note",
                gs_url,
                "-sh_file",
                sh_file,
                "-src_file",
                src_file,
            ],
            check=True,
        )
162
163
def submit_new_test_cases(
    last_seen_test_cases: Iterable[str],
    earliest_date_to_check: datetime.date,
    forcey: str,
    state_file_path: str,
) -> None:
    """Submits every not-yet-seen test-case to forcey.

    State is written to disk after each individual submission, and once more
    after all test-cases have been processed.

    Args:
      last_seen_test_cases: URLs that were already submitted by an earlier
        run; these are skipped if they show up again.
      earliest_date_to_check: test-cases older than this date are ignored.
      forcey: path to the forcey binary.
      state_file_path: path to our state file.
    """
    # Two collections are tracked:
    #
    # - `known_urls`: everything seen on this run or any prior one. This is
    #   what stops a test-case from being submitted twice, and it is what gets
    #   persisted after each individual submission.
    # - `urls_this_run`: only what this run actually listed. It is persisted
    #   at the very end, and only if every submission succeeded.
    #
    # The window of dates we look at only ever moves forward, so a URL that a
    # prior run saw but this run did not has aged out of the window for good;
    # dropping it from the final state loses nothing. If a submission fails
    # partway, though, we may not have revisited everything we had seen
    # before, so in that case the larger `known_urls` state is what remains on
    # disk.
    known_urls = set(last_seen_test_cases)
    urls_this_run = []
    newest_date = earliest_date_to_check
    for day, day_urls in test_cases_on_or_after(earliest_date_to_check):
        if day > newest_date:
            newest_date = day

        for url in day_urls:
            urls_this_run.append(url)
            if url not in known_urls:
                known_urls.add(url)
                submit_test_case(url, forcey)

                # Writing the state file for every submission costs something,
                # but it is the simplest guard against resubmitting, and the
                # cost is small next to the rest of the work:
                # - the state file stays small (<12KB; after the first run it
                #   only covers a few days of test-cases)
                # - each submission already downloads, unpacks and re-uploads
                #   multiple MB of test-case data.
                persist_state(known_urls, state_file_path, newest_date)

    persist_state(urls_this_run, state_file_path, newest_date)
227
228
def main(argv: List[str]) -> None:
    """Parses `argv` and submits all new test-cases.

    With `--last_date`, every test-case from that date onward is submitted and
    any existing state is ignored. Without it, the state file is read to find
    what was already submitted and where to resume.

    Args:
      argv: command-line arguments, excluding the program name.
    """
    logging.basicConfig(
        format=">> %(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: "
        "%(message)s",
        level=logging.INFO,
    )

    my_dir = os.path.dirname(os.path.abspath(__file__))

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--state_file", default=os.path.join(my_dir, "lexan-state.json")
    )
    parser.add_argument(
        "--last_date",
        help="The earliest date that we care about. All test cases from here "
        "on will be picked up. Format is YYYY-MM-DD.",
    )
    parser.add_argument(
        "--4c", dest="forcey", required=True, help="Path to a 4c client binary"
    )
    opts = parser.parse_args(argv)

    forcey = opts.forcey
    state_file = opts.state_file
    last_date_str = opts.last_date

    # A bare filename has an empty dirname, and `os.makedirs("")` raises
    # FileNotFoundError; in that case the file lives in the current directory
    # and there is nothing to create.
    state_dir = os.path.dirname(state_file)
    if state_dir:
        os.makedirs(state_dir, 0o755, exist_ok=True)

    if last_date_str is None:
        with open(state_file, encoding="utf-8") as f:
            data = json.load(f)
        most_recent_date = from_ymd(data["most_recent_date"])
        submit_new_test_cases(
            last_seen_test_cases=data["already_seen"],
            # Note that we always subtract one day from this to avoid a race:
            # uploads may appear slightly out-of-order (or builders may lag, or
            # ...), so the last test-case uploaded for 2020/01/01 might appear
            # _after_ the first test-case for 2020/01/02. Assuming that builders
            # won't lag behind for over a day, the easiest way to handle this is to
            # always check the previous and current days.
            earliest_date_to_check=most_recent_date
            - datetime.timedelta(days=1),
            forcey=forcey,
            state_file_path=state_file,
        )
    else:
        submit_new_test_cases(
            last_seen_test_cases=(),
            earliest_date_to_check=from_ymd(last_date_str),
            forcey=forcey,
            state_file_path=state_file,
        )
282
283
# Script entry point: run main() on the command-line arguments (minus the
# program name) and hand its return value to sys.exit().
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
286