# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import math
import os

import bisect_utils
import math_utils
import ttest


def ConfidenceScore(good_results_lists, bad_results_lists):
  """Calculates a confidence score.

  This score is a percentage which represents our degree of confidence in the
  proposition that the good results and bad results are distinct groups, and
  their differences aren't due to chance alone.

  Args:
    good_results_lists: A list of lists of "good" result numbers.
    bad_results_lists: A list of lists of "bad" result numbers.

  Returns:
    A number in the range [0, 100].
  """
  # If there's only one item in either list, this means only one revision was
  # classified good or bad; this isn't good enough evidence to make a
  # decision. If an empty list was passed, that also implies zero confidence.
  if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
    return 0.0

  # Flatten the lists of results lists.
  sample1 = sum(good_results_lists, [])
  sample2 = sum(bad_results_lists, [])

  # If there were only empty lists in either of the lists (this is unexpected
  # and normally shouldn't happen), then we also want to return 0.
  if not sample1 or not sample2:
    return 0.0

  # The p-value is approximately the probability of obtaining the given set
  # of good and bad values just by chance.
  _, _, p_value = ttest.WelchsTTest(sample1, sample2)
  return 100.0 * (1.0 - p_value)


class BisectResults(object):

  def __init__(self, depot_registry, source_control):
    self._depot_registry = depot_registry
    self.revision_data = {}
    self.error = None
    self._source_control = source_control

  @staticmethod
  def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
    """Compiles a list of other possible regressions from the revision data.

    Args:
      revision_data_sorted: Sorted list of (revision, revision data) pairs.
      bad_greater_than_good: Whether the result value at the "bad" revision is
          numerically greater than the result value at the "good" revision.

    Returns:
      A list of [current_rev, previous_rev, confidence] for other places where
      there may have been a regression.
    """
    other_regressions = []
    previous_values = []
    previous_id = None
    for current_id, current_data in revision_data_sorted:
      current_values = current_data['value']
      if current_values:
        current_values = current_values['values']
        if previous_values:
          confidence = ConfidenceScore(previous_values, [current_values])
          mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
          mean_of_current_runs = math_utils.Mean(current_values)

          # Check that the potential regression is in the same direction as
          # the overall regression. If the mean of the previous runs < the
          # mean of the current runs, this local regression is in the same
          # direction.
          prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
          is_same_direction = (prev_less_than_current if
              bad_greater_than_good else not prev_less_than_current)

          # Only report potential regressions with high confidence.
          if is_same_direction and confidence > 50:
            other_regressions.append([current_id, previous_id, confidence])
        previous_values.append(current_values)
        previous_id = current_id
    return other_regressions

  def GetResultsDict(self):
    """Prepares and returns information about the final results as a dict.

    Returns:
      A dictionary with the following fields:
        'first_working_revision': First good revision.
        'last_broken_revision': Last bad revision.
        'culprit_revisions': A list of revisions which contain the bad change
            introducing the failure.
        'other_regressions': A list of tuples representing other regressions
            which may have occurred.
        'regression_size': For performance bisects, this is a relative change
            of the mean metric value. For other bisects this field always
            contains 'zero-to-nonzero'.
        'regression_std_err': For performance bisects, it is a pooled standard
            error for groups of good and bad runs. Not used for other bisects.
        'confidence': For performance bisects, it is a confidence that the
            good and bad runs are distinct groups. Not used for
            non-performance bisects.
        'revision_data_sorted': dict mapping revision ids to data about that
            revision. Each piece of revision data consists of a dict with the
            following keys:
                'passed': Represents whether the performance test was
                    successful at that revision. Possible values include: 1
                    (passed), 0 (failed), '?' (skipped), 'F' (build failed).
                'depot': The depot that this revision is from (e.g. WebKit).
                'external': If the revision is a 'src' revision, 'external'
                    contains the revisions of each of the external libraries.
                'sort': A sort value for sorting the dict in order of commits.

            For example:
            {
              'CL #1':
              {
                'passed': False,
                'depot': 'chromium',
                'external': None,
                'sort': 0
              }
            }
    """
    revision_data_sorted = sorted(self.revision_data.iteritems(),
                                  key=lambda x: x[1]['sort'])

    # Find the range where it possibly broke.
    first_working_revision = None
    first_working_revision_index = -1
    last_broken_revision = None
    last_broken_revision_index = -1

    culprit_revisions = []
    other_regressions = []
    regression_size = 0.0
    regression_std_err = 0.0
    confidence = 0.0

    for i in xrange(len(revision_data_sorted)):
      k, v = revision_data_sorted[i]
      if v['passed'] == 1:
        if not first_working_revision:
          first_working_revision = k
          first_working_revision_index = i

      if not v['passed']:
        last_broken_revision = k
        last_broken_revision_index = i

    if last_broken_revision is not None and first_working_revision is not None:
      broken_means = []
      for i in xrange(0, last_broken_revision_index + 1):
        if revision_data_sorted[i][1]['value']:
          broken_means.append(revision_data_sorted[i][1]['value']['values'])

      working_means = []
      for i in xrange(first_working_revision_index, len(revision_data_sorted)):
        if revision_data_sorted[i][1]['value']:
          working_means.append(revision_data_sorted[i][1]['value']['values'])

      # Flatten the lists to calculate the mean of all values.
      working_mean = sum(working_means, [])
      broken_mean = sum(broken_means, [])

      # Calculate the approximate size of the regression.
      mean_of_bad_runs = math_utils.Mean(broken_mean)
      mean_of_good_runs = math_utils.Mean(working_mean)

      regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                        mean_of_bad_runs)
      if math.isnan(regression_size):
        regression_size = 'zero-to-nonzero'

      regression_std_err = math.fabs(math_utils.PooledStandardError(
          [working_mean, broken_mean]) /
          max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

      # Give a "confidence" in the bisect. At the moment we use how distinct
      # the values are before and after the last broken revision, and how
      # noisy the overall graph is.
      confidence = ConfidenceScore(working_means, broken_means)

      culprit_revisions = []

      cwd = os.getcwd()
      self._depot_registry.ChangeToDepotDir(
          self.revision_data[last_broken_revision]['depot'])

      if self.revision_data[last_broken_revision]['depot'] == 'cros':
        # Want to get a list of all the commits and what depots they belong
        # to so that we can grab info about each.
        cmd = ['repo', 'forall', '-c',
            'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
            last_broken_revision, first_working_revision + 1)]
        output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)

        changes = []
        assert not return_code, ('An error occurred while running '
                                 '"%s"' % ' '.join(cmd))
        last_depot = None
        cwd = os.getcwd()
        for l in output.split('\n'):
          if l:
            # Output will be in the form:
            # /path_to_depot
            # /path_to_other_depot
            # <SHA1> <commit subject>
            # /path_again
            # <SHA1> <commit subject>
            # etc.
            if l[0] == '/':
              last_depot = l
            else:
              contents = l.split(' ')
              if len(contents) > 1:
                changes.append([last_depot, contents[0]])
        for c in changes:
          os.chdir(c[0])
          info = self._source_control.QueryRevisionInfo(c[1])
          culprit_revisions.append((c[1], info, None))
      else:
        for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
          k, v = revision_data_sorted[i]
          if k == first_working_revision:
            break
          self._depot_registry.ChangeToDepotDir(v['depot'])
          info = self._source_control.QueryRevisionInfo(k)
          culprit_revisions.append((k, info, v['depot']))
      os.chdir(cwd)

      # Check for any other possible regression ranges.
      other_regressions = self._FindOtherRegressions(
          revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)

    return {
        'first_working_revision': first_working_revision,
        'last_broken_revision': last_broken_revision,
        'culprit_revisions': culprit_revisions,
        'other_regressions': other_regressions,
        'regression_size': regression_size,
        'regression_std_err': regression_std_err,
        'confidence': confidence,
        'revision_data_sorted': revision_data_sorted
    }
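

# Illustrative sketch only: a small demonstration of how ConfidenceScore
# behaves on hand-made data, assuming the sibling ttest module imported above
# is available on the path. The example data below is hypothetical, not taken
# from any real bisect run. Two "good" runs clustered around 1.0 and two "bad"
# runs clustered around 2.0 form clearly distinct groups, so the score should
# be close to 100; heavily overlapping groups would score much lower.
if __name__ == '__main__':
  example_good_runs = [[1.0, 1.1, 0.9], [1.05, 0.95, 1.0]]
  example_bad_runs = [[2.0, 2.1, 1.9], [2.05, 1.95, 2.0]]
  # Prints a value in [0, 100]; higher means the difference between the two
  # groups is less likely to be explained by noise alone.
  print 'Example confidence score: %.2f' % ConfidenceScore(
      example_good_runs, example_bad_runs)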