# Copyright 2020 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Methods related to querying the ResultDB BigQuery tables."""

import logging
import time
from typing import Collection, Dict, Generator, Iterable, List, Optional, Tuple

from google.cloud import bigquery
from google.cloud import bigquery_storage
import pandas

from typ import expectations_parser
from typ import json_results
from unexpected_passes_common import constants
from unexpected_passes_common import data_types
from unexpected_passes_common import expectations

DEFAULT_NUM_SAMPLES = 100

# Subquery for getting all try builds that were used for CL submission. 30 days
# is chosen because the ResultDB tables we pull data from only keep data around
# for 30 days.
PARTITIONED_SUBMITTED_BUILDS_TEMPLATE = """\
    SELECT
      CONCAT("build-", CAST(unnested_builds.id AS STRING)) as id
    FROM
      `commit-queue.{project_view}.attempts`,
      UNNEST(builds) as unnested_builds,
      UNNEST(gerrit_changes) as unnested_changes
    WHERE
      unnested_builds.host = "cr-buildbucket.appspot.com"
      AND unnested_changes.submit_status = "SUCCESS"
      AND start_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(),
                                     INTERVAL 30 DAY)"""
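
# The {project_view} placeholder above is presumably filled in with the
# commit-queue BigQuery view for the project being queried, e.g. (hypothetical
# view name):
#   PARTITIONED_SUBMITTED_BUILDS_TEMPLATE.format(project_view='chromium')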
QueryResult = pandas.Series


class BigQueryQuerier:
  """Class to handle all BigQuery queries for a script invocation."""

  def __init__(self, suite: Optional[str], project: str, num_samples: int,
               keep_unmatched_results: bool):
    """
    Args:
      suite: A string containing the name of the suite that is being queried
          for. Can be None if there is no differentiation between different
          suites.
      project: A string containing the billing project to use for BigQuery.
      num_samples: An integer containing the number of builds to pull results
          from.
      keep_unmatched_results: Whether to store and return unmatched results
          for debugging purposes.
    """
    self._suite = suite
    self._project = project
    self._num_samples = num_samples or DEFAULT_NUM_SAMPLES
    self._keep_unmatched_results = keep_unmatched_results

    assert self._num_samples > 0

  def FillExpectationMapForBuilders(
      self, expectation_map: data_types.TestExpectationMap,
      builders: Collection[data_types.BuilderEntry]
  ) -> Dict[str, data_types.ResultListType]:
    """Fills |expectation_map| with results from |builders|.

    Args:
      expectation_map: A data_types.TestExpectationMap. Will be modified
          in-place.
      builders: An iterable of data_types.BuilderEntry containing the builders
          to query.

    Returns:
      A dict containing any retrieved results that did not have a matching
      expectation in |expectation_map|, in the following format:
      {
        |project|/|builder_type|:|builder_name| (str): [
          result1 (data_types.Result),
          result2 (data_types.Result),
          ...
        ],
      }
    """
    start_time = time.time()
    logging.debug('Starting to fill expectation map for %d builders',
                  len(builders))
    assert isinstance(expectation_map, data_types.TestExpectationMap)
    # Ensure that all the builders are of the same type since we make some
    # assumptions about that later on.
    assert builders
    builder_type = None
    for b in builders:
      if builder_type is None:
        builder_type = b.builder_type
      else:
        assert b.builder_type == builder_type

    internal_statuses = set()
    for b in builders:
      internal_statuses.add(b.is_internal_builder)

    matched_builders = set()
    all_unmatched_results = {}
    for internal in internal_statuses:
      for builder_name, results, expectation_files in (
          self.GetBuilderGroupedQueryResults(builder_type, internal)):
        matching_builder = None
        for b in builders:
          if b.name == builder_name and b.is_internal_builder == internal:
            matching_builder = b
            break

        if not matching_builder:
          logging.warning(
              'Did not find a matching builder for name %s and '
              'internal status %s. This is normal if the builder '
              'is no longer running tests (e.g. it was '
              'experimental).', builder_name, internal)
          continue

        if matching_builder in matched_builders:
          raise RuntimeError(
              f'Got query result batches matched to builder '
              f'{matching_builder} twice - this is indicative of a malformed '
              f'query returning results that are not sorted by builder')
        matched_builders.add(matching_builder)

        prefixed_builder_name = '%s/%s:%s' % (matching_builder.project,
                                              matching_builder.builder_type,
                                              matching_builder.name)
        unmatched_results = expectation_map.AddResultList(
            prefixed_builder_name, results, expectation_files)
        if self._keep_unmatched_results:
          if unmatched_results:
            all_unmatched_results[prefixed_builder_name] = unmatched_results
        else:
          logging.info('Dropping %d unmatched results', len(unmatched_results))

    logging.debug('Filling expectation map took %f', time.time() - start_time)
    return all_unmatched_results

  def GetBuilderGroupedQueryResults(
      self, builder_type: str, is_internal: bool
  ) -> Generator[Tuple[str, data_types.ResultListType, Optional[List[str]]],
                 None, None]:
    """Generates results for all relevant builders grouped by builder name.

    Args:
      builder_type: Whether the builders are CI or try builders.
      is_internal: Whether the builders are internal.

    Yields:
      A tuple (builder_name, results, expectation_files). |builder_name| is a
      string specifying the builder that |results| came from. |results| is a
      data_types.ResultListType containing all the results for |builder_name|.
      |expectation_files| is a list of expectation files used by the tests in
      |results|, or None if all expectation files should be considered.
    """
    if builder_type == constants.BuilderTypes.CI:
      if is_internal:
        query = self._GetInternalCiQuery()
      else:
        query = self._GetPublicCiQuery()
    elif builder_type == constants.BuilderTypes.TRY:
      if is_internal:
        query = self._GetInternalTryQuery()
      else:
        query = self._GetPublicTryQuery()
    else:
      raise RuntimeError(f'Unknown builder type {builder_type}')

    current_builder = None
    rows_for_builder = []
    for row in self._GetSeriesForQuery(query):
      if current_builder is None:
        current_builder = row.builder_name
      if row.builder_name != current_builder:
        results_for_builder, expectation_files = self._ProcessRowsForBuilder(
            rows_for_builder)
        # The processing should have cleared out all the stored rows.
        assert not rows_for_builder
        yield current_builder, results_for_builder, expectation_files
        current_builder = row.builder_name
      rows_for_builder.append(row)

    if current_builder is None:
      logging.warning(
          'Did not get any results for builder type %s and internal status '
          '%s. Depending on where tests are run and how frequently trybots '
          'are used for submission, this may be benign.', builder_type,
          is_internal)

    # Handle any rows still buffered for the final builder.
    if current_builder is not None and rows_for_builder:
      results_for_builder, expectation_files = self._ProcessRowsForBuilder(
          rows_for_builder)
      assert not rows_for_builder
      yield current_builder, results_for_builder, expectation_files
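
  # A minimal consumption sketch of the generator above, assuming a
  # hypothetical concrete subclass that implements the query methods:
  #
  #   querier = SomeConcreteQuerier(suite=None, project='my-billing-project',
  #                                 num_samples=100,
  #                                 keep_unmatched_results=False)
  #   for builder, results, expectation_files in (
  #       querier.GetBuilderGroupedQueryResults(constants.BuilderTypes.CI,
  #                                             False)):
  #     ...  # |results| is a list of data_types.Result objects for |builder|.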
  def _GetSeriesForQuery(self,
                         query: str) -> Generator[pandas.Series, None, None]:
    """Generates results for |query|.

    Args:
      query: A string containing the BigQuery query to run.

    Yields:
      A pandas.Series object for each row returned by the query. Columns can
      be accessed directly as attributes.
    """
    client = bigquery.Client(project=self._project)
    job = client.query(query)
    row_iterator = job.result()
    # Using a DataFrame iterator instead of directly using |row_iterator|
    # allows us to use the BigQuery Storage API, which results in ~10x faster
    # query result retrieval at the cost of a few more dependencies.
    dataframe_iterator = row_iterator.to_dataframe_iterable(
        bigquery_storage.BigQueryReadClient())
    for df in dataframe_iterator:
      for _, row in df.iterrows():
        yield row

  def _GetPublicCiQuery(self) -> str:
    """Returns the BigQuery query for public CI builder results."""
    raise NotImplementedError()

  def _GetInternalCiQuery(self) -> str:
    """Returns the BigQuery query for internal CI builder results."""
    raise NotImplementedError()

  def _GetPublicTryQuery(self) -> str:
    """Returns the BigQuery query for public try builder results."""
    raise NotImplementedError()

  def _GetInternalTryQuery(self) -> str:
    """Returns the BigQuery query for internal try builder results."""
    raise NotImplementedError()
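
  # The queries returned by the four methods above are assumed to be full
  # BigQuery queries whose rows are sorted by builder and expose at least the
  # columns referenced elsewhere in this class: builder_name, id, test_id,
  # status, typ_tags, and step_name.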
  def _ProcessRowsForBuilder(
      self, rows: List[QueryResult]
  ) -> Tuple[data_types.ResultListType, Optional[List[str]]]:
    """Processes rows from a query into data_types.Result representations.

    Args:
      rows: A list of rows from a BigQuery query.

    Returns:
      A tuple (results, expectation_files). |results| is a list of
      data_types.Result objects. |expectation_files| is the list of expectation
      files that are used by the tests in |results|, but can be None to specify
      that all expectation files should be considered.
    """
    # It's possible that a builder runs multiple versions of a test with
    # different expectation files for each version. So, find a result for each
    # unique step and get the expectation files from all of them.
    results_for_each_step = {}
    for r in rows:
      step_name = r.step_name
      if step_name not in results_for_each_step:
        results_for_each_step[step_name] = r

    expectation_files = set()
    for r in results_for_each_step.values():
      # None is a special value indicating "use all expectation files", so
      # handle that.
      ef = self._GetRelevantExpectationFilesForQueryResult(r)
      if ef is None:
        expectation_files = None
        break
      expectation_files |= set(ef)
    if expectation_files is not None:
      expectation_files = list(expectation_files)

    # The query result list is potentially very large, so reduce the list as
    # we iterate over it instead of using a standard for/in so that we don't
    # temporarily end up with a ~2x increase in memory.
    results = []
    while rows:
      r = rows.pop()
      if self._ShouldSkipOverResult(r):
        continue
      results.append(self._ConvertBigQueryRowToResultObject(r))

    return results, expectation_files

  def _ConvertBigQueryRowToResultObject(self,
                                        row: QueryResult) -> data_types.Result:
    """Converts a single BigQuery result row to a data_types.Result.

    Args:
      row: A single row from BigQuery.

    Returns:
      A data_types.Result object containing the information from |row|.
    """
    build_id = _StripPrefixFromBuildId(row.id)
    test_name = self._StripPrefixFromTestId(row.test_id)
    actual_result = _ConvertActualResultToExpectationFileFormat(row.status)
    tags = expectations.GetInstance().FilterToKnownTags(row.typ_tags)
    step = row.step_name
    return data_types.Result(test_name, tags, actual_result, step, build_id)

  def _GetRelevantExpectationFilesForQueryResult(
      self, query_result: QueryResult) -> Optional[Iterable[str]]:
    """Gets the relevant expectation file names for a given query result.

    Args:
      query_result: An object representing a row/result from a query. Columns
          can be accessed via .column_name.

    Returns:
      An iterable of strings containing expectation file names that are
      relevant to |query_result|, or None if all expectation files should be
      considered relevant.
    """
    raise NotImplementedError()

  def _ShouldSkipOverResult(self, result: QueryResult) -> bool:
    """Whether |result| should be ignored and skipped over.

    Args:
      result: A QueryResult (pandas.Series) containing a single BigQuery
          result row.

    Returns:
      True if the result should be skipped over/ignored, otherwise False.
    """
    del result
    return False

  def _StripPrefixFromTestId(self, test_id: str) -> str:
    """Strips the prefix from a test ID, leaving only the test case name.

    Args:
      test_id: A string containing a full ResultDB test ID, e.g.
          ninja://target/directory.suite.class.test_case

    Returns:
      A string containing the test case name extracted from |test_id|.
    """
    raise NotImplementedError()


def _StripPrefixFromBuildId(build_id: str) -> str:
  # Build IDs provided by ResultDB are prefixed with "build-".
  split_id = build_id.split('-')
  assert len(split_id) == 2
  return split_id[-1]


def _ConvertActualResultToExpectationFileFormat(actual_result: str) -> str:
  # Web tests use ResultDB's ABORT value for both test timeouts and device
  # failures, but ABORT is not defined in typ, so map it to a timeout here.
  if actual_result == 'ABORT':
    actual_result = json_results.ResultType.Timeout
  # The result reported to ResultDB is in the format PASS/FAIL, while the
  # expected results in an expectation file are in the format Pass/Failure.
  return expectations_parser.RESULT_TAGS[actual_result]
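
# Illustrative behavior of the helpers above (the second example assumes typ's
# RESULT_TAGS maps ResultType values to expectation-file-style tags):
#   _StripPrefixFromBuildId('build-1234') returns '1234'.
#   _ConvertActualResultToExpectationFileFormat('FAIL') is expected to return
#   'Failure', matching the Pass/Failure naming used in expectation files.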