• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
3# Copyright 2020 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Fetches and submits the latest test-cases from Lexan's crash bucket."""
8
import argparse
import contextlib
import datetime
import json
import logging
import os
import shutil
import subprocess
import sys
import tempfile
from typing import Generator, Iterable, List, Tuple
20
# Root of the crash-report bucket. The helpers below list it as
# `<gsurl_base>/<year>/<month:02d>/<day:02d>/<test-case archive>`.
gsurl_base = 'gs://chrome-clang-crash-reports/v1'
22
23
def gsutil_ls(loc: str) -> List[str]:
  """Runs `gsutil.py ls` on `loc` and returns its output lines, stripped.

  Raises:
    subprocess.CalledProcessError: if `gsutil.py` exits with a nonzero status.
  """
  command = ['gsutil.py', 'ls', loc]
  listing = subprocess.run(
      command,
      check=True,
      encoding='utf-8',
      stdout=subprocess.PIPE,
  ).stdout
  return list(map(str.strip, listing.splitlines()))
30
31
def gsurl_ls_last_numbers(url: str) -> List[int]:
  """Lists `url` and returns each entry's last path component as an int.

  Trailing slashes are ignored when picking the last component. The result is
  sorted in ascending order. Raises ValueError if a component isn't an integer.
  """
  numbers = []
  for entry in gsutil_ls(url):
    last_component = entry.rstrip('/').split('/')[-1]
    numbers.append(int(last_component))
  numbers.sort()
  return numbers
34
35
def get_available_year_numbers() -> List[int]:
  """Returns the sorted numeric entries (years) directly under `gsurl_base`."""
  years = gsurl_ls_last_numbers(gsurl_base)
  return years
38
39
def get_available_month_numbers(year: int) -> List[int]:
  """Returns the sorted numeric entries (months) listed under `year`."""
  year_url = '/'.join((gsurl_base, f'{year}'))
  return gsurl_ls_last_numbers(year_url)
42
43
def get_available_day_numbers(year: int, month: int) -> List[int]:
  """Returns the sorted numeric entries (days) listed under `year`/`month`.

  The month is zero-padded to two digits when building the URL.
  """
  month_url = '/'.join((gsurl_base, f'{year}', f'{month:02d}'))
  return gsurl_ls_last_numbers(month_url)
46
47
def get_available_test_case_urls(year: int, month: int, day: int) -> List[str]:
  """Returns every entry `gsutil.py ls` reports for the given day's directory.

  Both month and day are zero-padded to two digits when building the URL.
  """
  day_url = '/'.join((gsurl_base, f'{year}', f'{month:02d}', f'{day:02d}'))
  return gsutil_ls(day_url)
50
51
def test_cases_on_or_after(
    date: datetime.date
) -> Generator[Tuple[datetime.date, List[str]], None, None]:
  """Yields all test-cases submitted on or after the given date, day by day.

  Args:
    date: the earliest date to consider. A `datetime.datetime` is accepted as
      well; only its date portion is used.

  Yields:
    `(day, urls)` pairs, where `day` is a `datetime.date` that is >= `date`
    and `urls` is the listing of that day's directory in the bucket.
  """
  # `datetime.datetime` is a subclass of `datetime.date`, but ordering one
  # against the other raises TypeError. Normalize so `when < date` below is
  # always a date-vs-date comparison.
  if isinstance(date, datetime.datetime):
    date = date.date()

  for year in get_available_year_numbers():
    # Skip whole years/months early so we don't list directories that can't
    # contain anything new enough.
    if year < date.year:
      continue

    for month in get_available_month_numbers(year):
      if year == date.year and month < date.month:
        continue

      for day in get_available_day_numbers(year, month):
        when = datetime.date(year, month, day)
        if when < date:
          continue

        yield when, get_available_test_case_urls(year, month, day)
69
70
def to_ymd(date: datetime.date) -> str:
  """Formats `date` as a `YYYY-MM-DD` string."""
  return f'{date:%Y-%m-%d}'
73
74
def from_ymd(date_str: str) -> datetime.date:
  """Parses a `YYYY-MM-DD` string into a `datetime.date`.

  Raises:
    ValueError: if `date_str` doesn't match the `%Y-%m-%d` format.
  """
  parsed = datetime.datetime.strptime(date_str, '%Y-%m-%d')
  return datetime.date(parsed.year, parsed.month, parsed.day)
77
78
def persist_state(seen_urls: Iterable[str], state_file: str,
                  current_date: datetime.date) -> None:
  """Writes the script's state to `state_file` as JSON.

  The state is first written to `state_file + '.tmp'` and then moved over
  `state_file`, so a crash mid-write never leaves a truncated state file.

  Args:
    seen_urls: test-case URLs to record under `already_seen` (stored sorted).
    state_file: path of the state file to (over)write.
    current_date: date to record under `most_recent_date`, as YYYY-MM-DD.
  """
  tmp_state_file = state_file + '.tmp'
  with open(tmp_state_file, 'w', encoding='utf-8') as f:
    json.dump(
        {
            'already_seen': sorted(seen_urls),
            'most_recent_date': to_ymd(current_date),
        },
        f,
    )
  # `os.replace` overwrites an existing destination on every platform;
  # `os.rename` raises on Windows if `state_file` already exists, which it
  # does on every persist after the first.
  os.replace(tmp_state_file, state_file)
91
92
@contextlib.contextmanager
def temp_dir() -> Generator[str, None, None]:
  """Context manager yielding a fresh temporary directory's path.

  The directory name ends in `lexan-autosubmit`. The directory and all of its
  contents are removed on exit, whether or not the body raised.
  """
  scratch_dir = tempfile.mkdtemp(suffix='lexan-autosubmit')
  try:
    yield scratch_dir
  finally:
    shutil.rmtree(scratch_dir)
100
101
def download_and_unpack_test_case(gs_url: str, tempdir: str) -> None:
  """Copies the archive at `gs_url` into `tempdir` and extracts it there.

  The archive is saved as `test_case<ext>`, where `<ext>` is the final
  extension of `gs_url`, unpacked with `tar xaf` run from inside `tempdir`,
  and then deleted so that only the extracted files remain.

  Raises:
    subprocess.CalledProcessError: if `gsutil.py` or `tar` fails.
  """
  _, extension = os.path.splitext(gs_url)
  archive_name = f'test_case{extension}'
  archive_path = os.path.join(tempdir, archive_name)

  fetch_cmd = ['gsutil.py', 'cp', gs_url, archive_path]
  unpack_cmd = ['tar', 'xaf', archive_name]
  subprocess.run(fetch_cmd, check=True)
  subprocess.run(unpack_cmd, cwd=tempdir, check=True)

  os.unlink(archive_path)
109
110
def _find_repro_files(tempdir: str) -> Tuple[str, str]:
  """Returns `(sh_file, src_file)` paths for the test-case unpacked in tempdir.

  Raises:
    ValueError: if `tempdir` doesn't hold exactly one `.sh` file and exactly
      one other file (ignoring `.crash` files).
  """
  # Sometimes (e.g., in
  # gs://chrome-clang-crash-reports/v1/2020/03/27/
  # chromium.clang-ToTiOS-12754-GTXToolKit-2bfcde.tgz)
  # we'll get `.crash` files. Unclear why, but let's filter them out anyway.
  repro_files = [
      os.path.join(tempdir, x)
      for x in os.listdir(tempdir)
      if not x.endswith('.crash')
  ]
  sh_files = [x for x in repro_files if x.endswith('.sh')]
  src_files = [x for x in repro_files if not x.endswith('.sh')]
  # This validates downloaded data, so raise explicitly rather than `assert`:
  # asserts are stripped when Python runs with -O.
  if len(sh_files) != 1 or len(src_files) != 1:
    raise ValueError(
        f'Expected exactly one .sh file and one source file; got {repro_files}')
  return sh_files[0], src_files[0]


def submit_test_case(gs_url: str, cr_tool: str) -> None:
  """Downloads the test-case at `gs_url` and submits it via `cr_tool reduce`.

  Test-cases whose `.sh` file mentions clang 9.0.0 are logged and skipped.

  Args:
    gs_url: URL of the test-case archive; also passed as the `-note`.
    cr_tool: path to the binary that is invoked with the `reduce` subcommand.

  Raises:
    ValueError: if the archive doesn't contain the expected pair of files.
    subprocess.CalledProcessError: if downloading, unpacking, or `cr_tool`
      fails.
  """
  logging.info('Submitting %s', gs_url)
  with temp_dir() as tempdir:
    download_and_unpack_test_case(gs_url, tempdir)
    # Both paths already include `tempdir`; don't join it on again below.
    sh_file, src_file = _find_repro_files(tempdir)

    # Peephole: lexan got a crash upload with a way old clang. Ignore it.
    with open(sh_file, encoding='utf-8') as f:
      if 'Crash reproducer for clang version 9.0.0' in f.read():
        logging.warning('Skipping upload for %s; seems to be with an old clang',
                        gs_url)
        return

    subprocess.run(
        [
            cr_tool,
            'reduce',
            '-stream=false',
            '-wait=false',
            '-note',
            gs_url,
            '-sh_file',
            sh_file,
            '-src_file',
            src_file,
        ],
        check=True,
    )
155
156
def submit_new_test_cases(
    last_seen_test_cases: Iterable[str],
    earliest_date_to_check: datetime.date,
    forcey: str,
    state_file_path: str,
) -> None:
  """Submits every not-yet-seen test-case to forcey.

  State is written out after each individual submission, and once more at the
  end with a trimmed-down set of test-cases.

  Args:
    last_seen_test_cases: test-cases that were already submitted; these are
      skipped if encountered again.
    earliest_date_to_check: test-cases older than this date are not considered.
    forcey: path to the forcey binary.
    state_file_path: path to our state file.
  """
  # Two collections are tracked:
  # - `known_urls`: everything seen on this invocation or any prior one. In
  #   all cases we care about, checking against it keeps us from submitting a
  #   test-case twice.
  # - `urls_this_run`: only what this invocation saw. If _every_ submission
  #   succeeds, this is what gets persisted as "all test-cases seen so far".
  #
  # Dropping the rest is fine: the test-cases this script looks at form a
  # sliding window that only moves forward, so a test-case that was seen
  # before but isn't seen now has aged out of the window and will never show
  # up again. Remembering it would be pointless.
  #
  # The trimmed set is only written when _everything_ succeeds. If something
  # fails partway through, we may not have revisited test-cases that we saw
  # previously, so the full union has to stay on disk.
  known_urls = set(last_seen_test_cases)
  urls_this_run = []
  newest_date = earliest_date_to_check
  for day, day_urls in test_cases_on_or_after(earliest_date_to_check):
    if day > newest_date:
      newest_date = day

    for url in day_urls:
      urls_this_run.append(url)
      if url not in known_urls:
        known_urls.add(url)
        submit_test_case(url, forcey)

        # Writing state after every submission has a cost, but it's the
        # simplest way to avoid resubmitting test-cases. For perspective:
        # - the state file is small (<12KB, as it only holds a few days' worth
        #   of test-cases after the first run)
        # - each submission already downloads, unzips, and reuploads multiple
        #   MB of test-case bytes.
        #
        # Next to that, this overhead is unlikely to matter.
        persist_state(known_urls, state_file_path, newest_date)

  persist_state(urls_this_run, state_file_path, newest_date)
217
218
def main(argv: List[str]) -> None:
  """Parses `argv` and submits any new test-cases.

  If `--last_date` is given, every test-case from that date on is submitted
  and no prior state is consulted. Otherwise, the starting date and the set
  of already-submitted test-cases are read from the state file.

  Args:
    argv: command-line arguments, excluding the program name.
  """
  logging.basicConfig(
      format='>> %(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: '
      '%(message)s',
      level=logging.INFO,
  )

  my_dir = os.path.dirname(os.path.abspath(__file__))

  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument(
      '--state_file', default=os.path.join(my_dir, 'lexan-state.json'))
  parser.add_argument(
      '--last_date',
      help='The earliest date that we care about. All test cases from here '
      'on will be picked up. Format is YYYY-MM-DD.')
  parser.add_argument(
      '--4c', dest='forcey', required=True, help='Path to a 4c client binary')
  opts = parser.parse_args(argv)

  forcey = opts.forcey
  state_file = opts.state_file
  last_date_str = opts.last_date

  # Resolve to an absolute path before taking the dirname: for a bare filename
  # such as `state.json`, `os.path.dirname` returns '' and `os.makedirs('')`
  # raises FileNotFoundError.
  state_dir = os.path.dirname(os.path.abspath(state_file))
  os.makedirs(state_dir, 0o755, exist_ok=True)

  if last_date_str is None:
    with open(state_file, encoding='utf-8') as f:
      data = json.load(f)
    most_recent_date = from_ymd(data['most_recent_date'])
    submit_new_test_cases(
        last_seen_test_cases=data['already_seen'],
        # Note that we always subtract one day from this to avoid a race:
        # uploads may appear slightly out-of-order (or builders may lag, or
        # ...), so the last test-case uploaded for 2020/01/01 might appear
        # _after_ the first test-case for 2020/01/02. Assuming that builders
        # won't lag behind for over a day, the easiest way to handle this is to
        # always check the previous and current days.
        earliest_date_to_check=most_recent_date - datetime.timedelta(days=1),
        forcey=forcey,
        state_file_path=state_file,
    )
  else:
    submit_new_test_cases(
        last_seen_test_cases=(),
        earliest_date_to_check=from_ymd(last_date_str),
        forcey=forcey,
        state_file_path=state_file,
    )


if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))
272