# Copyright 2020 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Methods related to querying the ResultDB BigQuery tables."""

import logging
import time
from typing import Collection, Dict, Generator, Iterable, List, Optional, Tuple

from google.cloud import bigquery
from google.cloud import bigquery_storage
import pandas

from typ import expectations_parser
from typ import json_results
from unexpected_passes_common import constants
from unexpected_passes_common import data_types
from unexpected_passes_common import expectations

DEFAULT_NUM_SAMPLES = 100

# Subquery for getting all try builds that were used for CL submission. 30 days
# is chosen because the ResultDB tables we pull data from only keep data around
# for 30 days.
PARTITIONED_SUBMITTED_BUILDS_TEMPLATE = """\
    SELECT
      CONCAT("build-", CAST(unnested_builds.id AS STRING)) as id
    FROM
      `commit-queue.{project_view}.attempts`,
      UNNEST(builds) as unnested_builds,
      UNNEST(gerrit_changes) as unnested_changes
    WHERE
      unnested_builds.host = "cr-buildbucket.appspot.com"
      AND unnested_changes.submit_status = "SUCCESS"
      AND start_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(),
                                     INTERVAL 30 DAY)"""
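# Note: {project_view} is a placeholder that must be filled in (e.g. via
# str.format()) before this subquery can run; the concrete project view is
# presumably supplied by whatever code assembles the full try queries, since
# the query-building methods below are left to subclasses.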
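# Each query result row is surfaced as a pandas.Series whose columns can be
# accessed directly as attributes (e.g. row.builder_name, row.test_id).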
QueryResult = pandas.Series


class BigQueryQuerier:
  """Class to handle all BigQuery queries for a script invocation."""
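  # This class is not usable on its own: subclasses are expected to override
  # the methods below that raise NotImplementedError (_GetPublicCiQuery,
  # _GetInternalCiQuery, _GetPublicTryQuery, _GetInternalTryQuery,
  # _GetRelevantExpectationFilesForQueryResult, and _StripPrefixFromTestId).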

  def __init__(self, suite: Optional[str], project: str, num_samples: int,
               keep_unmatched_results: bool):
    """
    Args:
      suite: A string containing the name of the suite that is being queried
          for. Can be None if there is no differentiation between different
          suites.
      project: A string containing the billing project to use for BigQuery.
      num_samples: An integer containing the number of builds to pull results
          from. If zero or None, DEFAULT_NUM_SAMPLES is used instead.
      keep_unmatched_results: Whether to store and return unmatched results
          for debugging purposes.
    """
    self._suite = suite
    self._project = project
    self._num_samples = num_samples or DEFAULT_NUM_SAMPLES
    self._keep_unmatched_results = keep_unmatched_results

    assert self._num_samples > 0

  def FillExpectationMapForBuilders(
      self, expectation_map: data_types.TestExpectationMap,
      builders: Collection[data_types.BuilderEntry]
  ) -> Dict[str, data_types.ResultListType]:
    """Fills |expectation_map| with results from |builders|.

    Args:
      expectation_map: A data_types.TestExpectationMap. Will be modified
          in-place.
      builders: A collection of data_types.BuilderEntry containing the builders
          to query. All entries must have the same builder type.

    Returns:
      A dict containing any results that were retrieved that did not have a
      matching expectation in |expectation_map|, in the following format:
      {
        |project|/|builder_type|:|builder_name| (str): [
          result1 (data_types.Result),
          result2 (data_types.Result),
          ...
        ],
      }
      The dict is always empty if |keep_unmatched_results| is False.
    """
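    # For example (hypothetical project/builder names), a non-empty return
    # value might look like:
    #   {'some-project/ci:Some Builder': [data_types.Result(...), ...]}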
    start_time = time.time()
    logging.debug('Starting to fill expectation map for %d builders',
                  len(builders))
    assert isinstance(expectation_map, data_types.TestExpectationMap)
    # Ensure that all the builders are of the same type since we make some
    # assumptions about that later on.
    assert builders
    builder_type = None
    for b in builders:
      if builder_type is None:
        builder_type = b.builder_type
      else:
        assert b.builder_type == builder_type

    internal_statuses = set()
    for b in builders:
      internal_statuses.add(b.is_internal_builder)

    matched_builders = set()
    all_unmatched_results = {}
    for internal in internal_statuses:
      for builder_name, results, expectation_files in (
          self.GetBuilderGroupedQueryResults(builder_type, internal)):
        matching_builder = None
        for b in builders:
          if b.name == builder_name and b.is_internal_builder == internal:
            matching_builder = b
            break

        if not matching_builder:
          logging.warning(
              'Did not find a matching builder for name %s and '
              'internal status %s. This is normal if the builder '
              'is no longer running tests (e.g. it was '
              'experimental).', builder_name, internal)
          continue

        if matching_builder in matched_builders:
          raise RuntimeError(
              f'Got query result batches matched to builder '
              f'{matching_builder} twice - this is indicative of a malformed '
              f'query returning results that are not sorted by builder')
        matched_builders.add(matching_builder)

        prefixed_builder_name = '%s/%s:%s' % (matching_builder.project,
                                              matching_builder.builder_type,
                                              matching_builder.name)
        unmatched_results = expectation_map.AddResultList(
            prefixed_builder_name, results, expectation_files)
        if self._keep_unmatched_results:
          if unmatched_results:
            all_unmatched_results[prefixed_builder_name] = unmatched_results
        else:
          logging.info('Dropping %d unmatched results', len(unmatched_results))

    logging.debug('Filling expectation map took %f', time.time() - start_time)
    return all_unmatched_results

  def GetBuilderGroupedQueryResults(
      self, builder_type: str, is_internal: bool
  ) -> Generator[Tuple[str, data_types.ResultListType, Optional[List[str]]],
                 None, None]:
    """Generates results for all relevant builders grouped by builder name.

    Args:
      builder_type: A string specifying whether the builders are CI or try
          builders.
      is_internal: Whether the builders are internal.

    Yields:
      A tuple (builder_name, results, expectation_files). |builder_name| is a
      string specifying the builder that |results| came from. |results| is a
      data_types.ResultListType containing all the results for |builder_name|.
      |expectation_files| is an optional list of expectation file names used by
      the tests in |results|, or None if all expectation files should be
      considered.
    """
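    # Note: the grouping below assumes that the underlying query returns rows
    # ordered/grouped by builder_name. If rows for one builder are interleaved
    # with rows for another, FillExpectationMapForBuilders will raise a
    # RuntimeError about results not being sorted by builder.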
    if builder_type == constants.BuilderTypes.CI:
      if is_internal:
        query = self._GetInternalCiQuery()
      else:
        query = self._GetPublicCiQuery()
    elif builder_type == constants.BuilderTypes.TRY:
      if is_internal:
        query = self._GetInternalTryQuery()
      else:
        query = self._GetPublicTryQuery()
    else:
      raise RuntimeError(f'Unknown builder type {builder_type}')

    current_builder = None
    rows_for_builder = []
    for row in self._GetSeriesForQuery(query):
      if current_builder is None:
        current_builder = row.builder_name
      if row.builder_name != current_builder:
        results_for_builder, expectation_files = self._ProcessRowsForBuilder(
            rows_for_builder)
        # The processing should have cleared out all the stored rows.
        assert not rows_for_builder
        yield current_builder, results_for_builder, expectation_files
        current_builder = row.builder_name
      rows_for_builder.append(row)

    if current_builder is None:
      logging.warning(
          'Did not get any results for builder type %s and internal status %s. '
          'Depending on where tests are run and how frequently trybots are '
          'used for submission, this may be benign.', builder_type, is_internal)

    if current_builder is not None and rows_for_builder:
      results_for_builder, expectation_files = self._ProcessRowsForBuilder(
          rows_for_builder)
      assert not rows_for_builder
      yield current_builder, results_for_builder, expectation_files

  def _GetSeriesForQuery(self,
                         query: str) -> Generator[pandas.Series, None, None]:
    """Generates results for |query|.

    Args:
      query: A string containing the BigQuery query to run.

    Yields:
      A pandas.Series object for each row returned by the query. Columns can be
      accessed directly as attributes.
    """
    client = bigquery.Client(project=self._project)
    job = client.query(query)
    row_iterator = job.result()
    # Using a Dataframe iterator instead of directly using |row_iterator| allows
    # us to use the BigQuery Storage API, which results in ~10x faster query
    # result retrieval at the cost of a few more dependencies.
    dataframe_iterator = row_iterator.to_dataframe_iterable(
        bigquery_storage.BigQueryReadClient())
    for df in dataframe_iterator:
      for _, row in df.iterrows():
        yield row

  def _GetPublicCiQuery(self) -> str:
    """Returns the BigQuery query for public CI builder results."""
    raise NotImplementedError()

  def _GetInternalCiQuery(self) -> str:
    """Returns the BigQuery query for internal CI builder results."""
    raise NotImplementedError()

  def _GetPublicTryQuery(self) -> str:
    """Returns the BigQuery query for public try builder results."""
    raise NotImplementedError()

  def _GetInternalTryQuery(self) -> str:
    """Returns the BigQuery query for internal try builder results."""
    raise NotImplementedError()

  def _ProcessRowsForBuilder(
      self, rows: List[QueryResult]
  ) -> Tuple[data_types.ResultListType, Optional[List[str]]]:
    """Processes rows from a query into data_types.Result representations.

    Args:
      rows: A list of rows from a BigQuery query.

    Returns:
      A tuple (results, expectation_files). |results| is a list of
      data_types.Result objects. |expectation_files| is the list of expectation
      files that are used by the tests in |results|, but can be None to specify
      that all expectation files should be considered.
    """
    # It's possible that a builder runs multiple versions of a test with
    # different expectation files for each version. So, find a result for each
    # unique step and get the expectation files from all of them.
    results_for_each_step = {}
    for r in rows:
      step_name = r.step_name
      if step_name not in results_for_each_step:
        results_for_each_step[step_name] = r

    expectation_files = set()
    for r in results_for_each_step.values():
      # None is a special value indicating "use all expectation files", so
      # handle that.
      ef = self._GetRelevantExpectationFilesForQueryResult(r)
      if ef is None:
        expectation_files = None
        break
      expectation_files |= set(ef)
    if expectation_files is not None:
      expectation_files = list(expectation_files)

    # The query result list is potentially very large, so reduce the list as we
    # iterate over it instead of using a standard for/in so that we don't
    # temporarily end up with a ~2x increase in memory.
    results = []
    while rows:
      r = rows.pop()
      if self._ShouldSkipOverResult(r):
        continue
      results.append(self._ConvertBigQueryRowToResultObject(r))

    return results, expectation_files

  def _ConvertBigQueryRowToResultObject(self,
                                        row: QueryResult) -> data_types.Result:
    """Converts a single BigQuery result row to a data_types.Result.

    Args:
      row: A single row from BigQuery.

    Returns:
      A data_types.Result object containing the information from |row|.
    """
    build_id = _StripPrefixFromBuildId(row.id)
    test_name = self._StripPrefixFromTestId(row.test_id)
    actual_result = _ConvertActualResultToExpectationFileFormat(row.status)
    tags = expectations.GetInstance().FilterToKnownTags(row.typ_tags)
    step = row.step_name
    return data_types.Result(test_name, tags, actual_result, step, build_id)

  def _GetRelevantExpectationFilesForQueryResult(
      self, query_result: QueryResult) -> Optional[Iterable[str]]:
    """Gets the relevant expectation file names for a given query result.

    Args:
      query_result: An object representing a row/result from a query. Columns
          can be accessed via .column_name.

    Returns:
      An iterable of strings containing expectation file names that are
      relevant to |query_result|, or None if all expectation files should be
      considered relevant.
    """
    raise NotImplementedError()

  def _ShouldSkipOverResult(self, result: QueryResult) -> bool:
    """Whether |result| should be ignored and skipped over.

    Args:
      result: A QueryResult containing a single BigQuery result row.

    Returns:
      True if the result should be skipped over/ignored, otherwise False.
    """
    del result
    return False

  def _StripPrefixFromTestId(self, test_id: str) -> str:
    """Strips the prefix from a test ID, leaving only the test case name.

    Args:
      test_id: A string containing a full ResultDB test ID, e.g.
          ninja://target/directory.suite.class.test_case

    Returns:
      A string containing the test case name extracted from |test_id|.
    """
    raise NotImplementedError()


def _StripPrefixFromBuildId(build_id: str) -> str:
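  """Strips the "build-" prefix from a ResultDB build ID.

  For example (hypothetical ID), "build-1234" becomes "1234".
  """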
  # Build IDs provided by ResultDB are prefixed with "build-"
  split_id = build_id.split('-')
  assert len(split_id) == 2
  return split_id[-1]


def _ConvertActualResultToExpectationFileFormat(actual_result: str) -> str:
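  """Converts a ResultDB result status to the expectation file format.

  For example, a status such as "PASS" maps to "Pass" via typ's
  expectations_parser.RESULT_TAGS.
  """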
  # Web tests use ResultDB's ABORT value for both test timeouts and device
  # failures, but ABORT is not defined in typ, so map it to a timeout here.
  if actual_result == 'ABORT':
    actual_result = json_results.ResultType.Timeout
  # The result reported to ResultDB is in the format PASS/FAIL, while the
  # expected results in an expectation file are in the format Pass/Failure.
  return expectations_parser.RESULT_TAGS[actual_result]
357