# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import math
import os

import bisect_utils
import math_utils
import ttest


def ConfidenceScore(good_results_lists, bad_results_lists):
  """Calculates a confidence score.

  This score is a percentage which represents our degree of confidence in the
  proposition that the good results and bad results are distinct groups, and
  their differences aren't due to chance alone.

  Args:
    good_results_lists: A list of lists of "good" result numbers.
    bad_results_lists: A list of lists of "bad" result numbers.

  Returns:
    A number in the range [0, 100].
  """
  # If there's only one item in either list, this means only one revision was
  # classified good or bad; this isn't good enough evidence to make a decision.
  # If an empty list was passed, that also implies zero confidence.
  if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
    return 0.0

  # Flatten the lists of results lists.
  sample1 = sum(good_results_lists, [])
  sample2 = sum(bad_results_lists, [])
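  # (Note: sum(list_of_lists, []) concatenates the sublists, e.g.
  # sum([[1, 2], [3]], []) == [1, 2, 3].)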

  # If either flattened sample is empty (i.e. the input contained only empty
  # lists, which is unexpected and normally shouldn't happen), we also want
  # to return 0.
  if not sample1 or not sample2:
    return 0.0

  # The p-value is approximately the probability of obtaining the given set
  # of good and bad values just by chance.
  _, _, p_value = ttest.WelchsTTest(sample1, sample2)
  return 100.0 * (1.0 - p_value)
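
# Example (hypothetical numbers): well-separated samples score close to 100,
# while too little data scores 0:
#
#   ConfidenceScore([[10.1, 10.2], [10.0, 10.3]],
#                   [[20.1, 20.4], [20.0, 20.2]])  # close to 100.0
#   ConfidenceScore([[10.0, 20.0]], [[10.0, 20.0]])  # 0.0 (single lists)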


class BisectResults(object):

  def __init__(self, depot_registry, source_control):
    self._depot_registry = depot_registry
    self.revision_data = {}
    self.error = None
    self._source_control = source_control

  @staticmethod
  def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
    """Compiles a list of other possible regressions from the revision data.

    Args:
      revision_data_sorted: Sorted list of (revision, revision data) pairs.
      bad_greater_than_good: Whether the result value at the "bad" revision is
          numerically greater than the result value at the "good" revision.

    Returns:
      A list of [current_rev, previous_rev, confidence] for other places where
      there may have been a regression.
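
      For example (revision ids and confidence value are hypothetical):
      [['rev_103', 'rev_102', 97.5]]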
    """
    other_regressions = []
    previous_values = []
    previous_id = None
    for current_id, current_data in revision_data_sorted:
      current_values = current_data['value']
      if current_values:
        current_values = current_values['values']
        if previous_values:
          confidence = ConfidenceScore(previous_values, [current_values])
          mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
          mean_of_current_runs = math_utils.Mean(current_values)

          # Check that the potential regression is in the same direction as
          # the overall regression. If the mean of the previous runs < the
          # mean of the current runs, this local regression is in the same
          # direction.
          prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
          is_same_direction = (prev_less_than_current if
              bad_greater_than_good else not prev_less_than_current)

          # Only report potential regressions with high confidence.
          if is_same_direction and confidence > 50:
            other_regressions.append([current_id, previous_id, confidence])
        previous_values.append(current_values)
        previous_id = current_id
    return other_regressions

  def GetResultsDict(self):
    """Prepares and returns information about the final results as a dict.

    Returns:
      A dictionary with the following fields:

      'first_working_revision': First good revision.
      'last_broken_revision': Last bad revision.
      'culprit_revisions': A list of revisions that contain the bad change
          introducing the failure.
      'other_regressions': A list of tuples representing other regressions
          that may have occurred.
      'regression_size': For performance bisects, this is a relative change of
          the mean metric value. For other bisects this field always contains
          'zero-to-nonzero'.
      'regression_std_err': For performance bisects, it is a pooled standard
          error for groups of good and bad runs. Not used for other bisects.
      'confidence': For performance bisects, it is a confidence that the good
          and bad runs are distinct groups. Not used for non-performance
          bisects.
      'revision_data_sorted': A list of (revision id, revision data) pairs,
          sorted in commit order. Each piece of revision data consists of a
          dict with the following keys:

          'passed': Represents whether the performance test was successful at
              that revision. Possible values include: 1 (passed), 0 (failed),
              '?' (skipped), 'F' (build failed).
          'depot': The depot that this revision is from (e.g. WebKit).
          'external': If the revision is a 'src' revision, 'external' contains
              the revisions of each of the external libraries.
          'sort': A sort value for ordering the revisions by commit.

          For example:
          [('CL #1',
            {
              'passed': 0,
              'depot': 'chromium',
              'external': None,
              'sort': 0
            })]
    """
    revision_data_sorted = sorted(self.revision_data.iteritems(),
                                  key=lambda x: x[1]['sort'])

    # Find the range where it possibly broke.
    first_working_revision = None
    first_working_revision_index = -1
    last_broken_revision = None
    last_broken_revision_index = -1

    culprit_revisions = []
    other_regressions = []
    regression_size = 0.0
    regression_std_err = 0.0
    confidence = 0.0

    for i in xrange(len(revision_data_sorted)):
      k, v = revision_data_sorted[i]
      if v['passed'] == 1:
        if first_working_revision is None:
          first_working_revision = k
          first_working_revision_index = i

      if not v['passed']:
        last_broken_revision = k
        last_broken_revision_index = i

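    # Note: the "broken" sample below includes everything up to the last
    # failing revision, and the "working" sample everything from the first
    # passing revision onward, so the two ranges can overlap when passes and
    # failures are interleaved.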
    if (last_broken_revision is not None and
        first_working_revision is not None):
      broken_means = []
      for i in xrange(0, last_broken_revision_index + 1):
        if revision_data_sorted[i][1]['value']:
          broken_means.append(revision_data_sorted[i][1]['value']['values'])

      working_means = []
      for i in xrange(first_working_revision_index, len(revision_data_sorted)):
        if revision_data_sorted[i][1]['value']:
          working_means.append(revision_data_sorted[i][1]['value']['values'])

      # Flatten the lists to calculate the mean of all values.
      working_mean = sum(working_means, [])
      broken_mean = sum(broken_means, [])

      # Calculate the approximate size of the regression.
      mean_of_bad_runs = math_utils.Mean(broken_mean)
      mean_of_good_runs = math_utils.Mean(working_mean)

      regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                        mean_of_bad_runs)
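      # RelativeChange is NaN when the good-run mean is zero (relative change
      # from a zero baseline is undefined), which is reported as a
      # zero-to-nonzero regression below.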
      if math.isnan(regression_size):
        regression_size = 'zero-to-nonzero'

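      # The pooled standard error is expressed as a percentage of the smaller
      # of the two means; max(0.0001, ...) guards against division by zero.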
      regression_std_err = math.fabs(math_utils.PooledStandardError(
          [working_mean, broken_mean]) /
          max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

      # Give a "confidence" in the bisect. At the moment we use how distinct
      # the values are before and after the last broken revision, and how
      # noisy the overall graph is.
      confidence = ConfidenceScore(working_means, broken_means)

      culprit_revisions = []

      cwd = os.getcwd()
      self._depot_registry.ChangeToDepotDir(
          self.revision_data[last_broken_revision]['depot'])

      if self.revision_data[last_broken_revision]['depot'] == 'cros':
        # We want to get a list of all the commits and what depots they belong
        # to so that we can grab info about each.
        cmd = ['repo', 'forall', '-c',
            'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
            last_broken_revision, first_working_revision + 1)]
        output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)

        changes = []
        assert not return_code, ('An error occurred while running '
                                 '"%s"' % ' '.join(cmd))
        last_depot = None
        for line in output.split('\n'):
          if line:
            # Output will be in the form:
            # /path_to_depot
            # /path_to_other_depot
            # <SHA1>
            # /path_again
            # <SHA1>
            # etc.
            if line[0] == '/':
              last_depot = line
            else:
              contents = line.split(' ')
              if len(contents) > 1:
                changes.append([last_depot, contents[0]])
        for c in changes:
          os.chdir(c[0])
          info = self._source_control.QueryRevisionInfo(c[1])
          culprit_revisions.append((c[1], info, None))
      else:
        for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
          k, v = revision_data_sorted[i]
          if k == first_working_revision:
            break
          self._depot_registry.ChangeToDepotDir(v['depot'])
          info = self._source_control.QueryRevisionInfo(k)
          culprit_revisions.append((k, info, v['depot']))
      os.chdir(cwd)

      # Check for any other possible regression ranges.
      other_regressions = self._FindOtherRegressions(
          revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)

    return {
        'first_working_revision': first_working_revision,
        'last_broken_revision': last_broken_revision,
        'culprit_revisions': culprit_revisions,
        'other_regressions': other_regressions,
        'regression_size': regression_size,
        'regression_std_err': regression_std_err,
        'confidence': confidence,
        'revision_data_sorted': revision_data_sorted
    }
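
# A minimal usage sketch (depot_registry and source_control are hypothetical
# stand-ins for the objects the bisect driver normally supplies):
#
#   results = BisectResults(depot_registry, source_control)
#   results.revision_data['CL #1'] = {
#       'passed': 1, 'depot': 'chromium', 'external': None, 'sort': 0,
#       'value': {'values': [10.0, 10.2]},
#   }
#   summary = results.GetResultsDict()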