• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #!/usr/bin/python2
2 
3 # Copyright 2014 Google Inc.
4 #
5 # Use of this source code is governed by a BSD-style license that can be
6 # found in the LICENSE file.
7 
8 """Skia's Chromium Codereview Comparison Script.
9 
10 This script takes two Codereview URLs, looks at the trybot results for
11 the two codereviews and compares the results.
12 
13 Usage:
14   compare_codereview.py CONTROL_URL ROLL_URL
15 """
16 
17 import collections
18 import os
19 import re
20 import sys
21 import urllib2
22 import HTMLParser
23 
24 
class CodeReviewHTMLParser(HTMLParser.HTMLParser):
  """Parses a CodeReview web page.

  Use the CodeReviewHTMLParser.parse static function to make use of
  this class.

  This uses the HTMLParser class because it's the best thing in
  Python's standard library.  We need a little more power than a
  regex.  [Search for "You can't parse [X]HTML with regex." for more
  information.]

  Once a <div id="tryjobdiv*"> has been seen, every
  <a class="build-result" status="..." href="..."> anchor is recorded
  in self.statuses, keyed by the anchor's (stripped) text.
  """
  # pylint: disable=I0011,R0904
  @staticmethod
  def parse(url):
    """Parses a CodeReview web page.

    Args:
      url (string), a codereview URL like this:
        'https://codereview.chromium.org/?????????'.

    Returns:
      A dictionary; the keys are bot_name strings, the values
      are CodeReviewHTMLParser.Status objects.  Returns None (after
      printing a message to stderr) if the URL could not be fetched.
    """
    parser = CodeReviewHTMLParser()
    try:
      response = urllib2.urlopen(url)
      try:
        parser.feed(response.read())
      finally:
        # Always release the connection, even if reading or parsing
        # raises.
        response.close()
    except (urllib2.URLError,):
      print >> sys.stderr, 'Error getting', url
      return None
    parser.close()
    return parser.statuses

  # namedtuples are like lightweight structs in Python.  The low
  # overhead of a tuple, but the ease of use of an object.
  Status = collections.namedtuple('Status', ['status', 'url'])

  def __init__(self):
    HTMLParser.HTMLParser.__init__(self)
    # id of the most recent <div id="tryjobdiv*"> seen, or None.
    self._id = None
    # status/href attributes of the build-result anchor being read.
    self._status = None
    self._href = None
    # Text accumulated inside the current build-result anchor.
    self._anchor_data = ''
    # True while we are between <a class="build-result"> and </a>.
    self._currently_parsing_trybotdiv = False
    # statuses is a dictionary of CodeReviewHTMLParser.Status
    self.statuses = {}

  def handle_starttag(self, tag, attrs):
    """Overrides the HTMLParser method to implement functionality.

    [[begin standard library documentation]]
    This method is called to handle the start of a tag
    (e.g. <div id="main">).

    The tag argument is the name of the tag converted to lower
    case. The attrs argument is a list of (name, value) pairs
    containing the attributes found inside the tag's <>
    brackets. The name will be translated to lower case, and
    quotes in the value have been removed, and character and
    entity references have been replaced.

    For instance, for the tag <A HREF="http://www.cwi.nl/">, this
    method would be called as handle_starttag('a', [('href',
    'http://www.cwi.nl/')]).
    [[end standard library documentation]]
    """
    attrs = dict(attrs)
    if tag == 'div':
      # We are looking for <div id="tryjobdiv*">.  A valueless
      # attribute (<div id>) is reported as None, so normalise to ''.
      id_attr = attrs.get('id') or ''
      if id_attr.startswith('tryjobdiv'):
        self._id = id_attr
    if (self._id and tag == 'a'
        and 'build-result' in (attrs.get('class') or '').split()):
      # If we are already inside a <div id="tryjobdiv*">, we
      # look for a link of the form
      # <a class="build-result" href="*">.  Then we save the
      # (non-standard) status attribute and the URL.
      self._status = attrs.get('status')
      self._href = attrs.get('href')
      self._currently_parsing_trybotdiv = True
      # Start saving anchor data from a clean slate.
      self._anchor_data = ''

  def handle_data(self, data):
    """Overrides the HTMLParser method to implement functionality.

    [[begin standard library documentation]]
    This method is called to process arbitrary data (e.g. text
    nodes and the content of <script>...</script> and
    <style>...</style>).
    [[end standard library documentation]]
    """
    # Save the text inside the <a></a> tags.  Assume <a> tags
    # aren't nested.
    if self._currently_parsing_trybotdiv:
      self._anchor_data += data

  def handle_endtag(self, tag):
    """Overrides the HTMLParser method to implement functionality.

    [[begin standard library documentation]]
    This method is called to handle the end tag of an element
    (e.g. </div>).  The tag argument is the name of the tag
    converted to lower case.
    [[end standard library documentation]]
    """
    # Key off the "currently parsing" flag rather than the status
    # value: an anchor without a status attribute must still end
    # the capture, otherwise its text (and everything after it)
    # would be glued onto the next bot's name.
    if tag == 'a' and self._currently_parsing_trybotdiv:
      # We take the accumulated self._anchor_data and save it as
      # the bot name.
      bot = self._anchor_data.strip()
      if bot and self._status:
        # Add to accumulating dictionary.
        self.statuses[bot] = CodeReviewHTMLParser.Status(
            status=self._status, url=self._href)
      # Reset state to search for the next bot.
      self._currently_parsing_trybotdiv = False
      self._anchor_data = ''
      self._status = None
      self._href = None
145 
146 
class BuilderHTMLParser(HTMLParser.HTMLParser):
  """Parses Trybot web pages.

  Use the BuilderHTMLParser.parse static function to make use of
  this class.

  This uses the HTMLParser class because it's the best thing in
  Python's standard library.  We need a little more power than a
  regex.  [Search for "You can't parse [X]HTML with regex." for more
  information.]

  For every outermost <li> that contains a
  <div class="failure result">, one Result is appended to
  self.failure_results when that <li> closes.
  """
  # pylint: disable=I0011,R0904
  @staticmethod
  def parse(url):
    """Parses a Trybot web page.

    Args:
      url (string), a trybot result URL.

    Returns:
      An array of BuilderHTMLParser.Results, each a description
      of failure results, along with an optional url.  Returns an
      empty array (after printing a message to stderr) if the URL
      could not be fetched.
    """
    parser = BuilderHTMLParser()
    try:
      response = urllib2.urlopen(url)
      try:
        parser.feed(response.read())
      finally:
        # Always release the connection, even if reading or parsing
        # raises.
        response.close()
    except (urllib2.URLError,):
      print >> sys.stderr, 'Error getting', url
      return []
    parser.close()
    return parser.failure_results

  Result = collections.namedtuple('Result', ['text', 'url'])

  def __init__(self):
    HTMLParser.HTMLParser.__init__(self)
    self.failure_results = []
    # _current_failure_result and _divlevel are not read anywhere
    # in this class; they are kept so the instance attributes stay
    # the same.
    self._current_failure_result = None
    self._divlevel = None
    # Current <li> nesting depth.
    self._li_level = 0
    # Text collected since the failure div was seen.
    self._li_data = ''
    # True once a failure div has been seen inside the current <li>.
    self._current_failure = False
    # href of the last '/logs/stdio' link seen in the current failure.
    self._failure_results_url = ''

  def handle_starttag(self, tag, attrs):
    """Overrides the HTMLParser method to implement functionality.

    [[begin standard library documentation]]
    This method is called to handle the start of a tag
    (e.g. <div id="main">).

    The tag argument is the name of the tag converted to lower
    case. The attrs argument is a list of (name, value) pairs
    containing the attributes found inside the tag's <>
    brackets. The name will be translated to lower case, and
    quotes in the value have been removed, and character and
    entity references have been replaced.

    For instance, for the tag <A HREF="http://www.cwi.nl/">, this
    method would be called as handle_starttag('a', [('href',
    'http://www.cwi.nl/')]).
    [[end standard library documentation]]
    """
    attrs = dict(attrs)
    if tag == 'li':
      # <li> tags can be nested.  So we have to count the
      # nest-level for backing out.
      self._li_level += 1
      return
    if tag == 'div' and attrs.get('class') == 'failure result':
      # We care about this sort of thing:
      # <li>
      #   <li>
      #   <li>
      #     <div class="failure result">...</div>
      #   </li>
      #   </li>
      #   We want this text here.
      # </li>
      if self._li_level > 0:
        self._current_failure = True  # Tells us to keep text.
      return

    if tag == 'a' and self._current_failure:
      # Anchors without an href (e.g. <a name="...">) or with a
      # valueless href yield None; treat that as "no link".
      href = attrs.get('href') or ''
      # Sometimes we want to keep the stdio url.  We always
      # return it, just in case.
      if href.endswith('/logs/stdio'):
        self._failure_results_url = href

  def handle_data(self, data):
    """Overrides the HTMLParser method to implement functionality.

    [[begin standard library documentation]]
    This method is called to process arbitrary data (e.g. text
    nodes and the content of <script>...</script> and
    <style>...</style>).
    [[end standard library documentation]]
    """
    if self._current_failure:
      self._li_data += data

  def handle_endtag(self, tag):
    """Overrides the HTMLParser method to implement functionality.

    [[begin standard library documentation]]
    This method is called to handle the end tag of an element
    (e.g. </div>).  The tag argument is the name of the tag
    converted to lower case.
    [[end standard library documentation]]
    """
    if tag != 'li' or self._li_level <= 0:
      # Not an <li>, or a stray </li> with no matching <li>.  Letting
      # the level go negative would stop it from ever returning to
      # zero, and no further results would be emitted.
      return
    self._li_level -= 1
    if self._li_level != 0:
      return
    if self._current_failure:
      result = self._li_data.strip()
      words = result.split()
      if words:
        # Sometimes, it repeats the same thing multiple times;
        # collapse a leading run of the first word into one copy.
        # The word is escaped so that text such as "c++" is matched
        # literally, and a function is used as the replacement so
        # backslashes in the word are not interpreted.
        first = words[0]
        escaped = re.escape(first)
        result = re.sub(
            r'^%s(\s+%s)+' % (escaped, escaped),
            lambda _match: first, result)
      result = re.sub(r'unexpected flaky.*', '', result)
      # Remove some extra unnecessary text.
      result = re.sub(r'\bpreamble\b', '', result)
      result = re.sub(r'\bstdio\b', '', result)
      url = self._failure_results_url
      self.failure_results.append(
          BuilderHTMLParser.Result(result, url))
      self._current_failure_result = None
    # Reset the state.
    self._current_failure = False
    self._li_data = ''
    self._failure_results_url = ''
281 
282 
def printer(indent, string):
  """Write indented, word-wrapped text to standard output.

  Args:
    indent: (int) indentation level; each level is two spaces.
    string: (string) text to print.  It is split on newlines and
      each line is wrapped separately to 68 - 2 * indent columns.
      Continuation lines of a wrapped line get one extra level of
      indentation.  Lines containing no words produce no output.
  """
  def wrap_to(line, columns):
    """Greedily pack the words of a line into rows of at most the
    given number of columns (a single over-long word gets its own
    row); return the rows as a list of strings.
    """
    rows = []
    current = ''
    for word in line.split():
      if not current:
        current = word
      elif len(current) + 1 + len(word) > columns:
        rows.append(current)
        current = word
      else:
        current = current + ' ' + word
    if current:
      rows.append(current)
    return rows

  spacer = '  '
  first_prefix = spacer * indent
  rest_prefix = first_prefix + spacer
  width = 68 - (2 * indent)
  stream = sys.stdout
  for raw_line in string.split('\n'):
    for index, row in enumerate(wrap_to(raw_line, width)):
      prefix = rest_prefix if index > 0 else first_prefix
      stream.write(prefix + row + '\n')
  stream.flush()
314 
315 
def main(control_url, roll_url, verbosity=1):
  """Compare two Codereview URLs

  Fetches the trybot statuses of both codereviews, prints the
  failure details for every bot the two have in common where at
  least one side failed, and then (optionally) a summary table.

  Args:
    control_url, roll_url: (strings) URL of the format
      https://codereview.chromium.org/?????????

    verbosity: (int) verbose level.  0, 1, or 2.  Above 0 the
      summary table is printed; above 1 bots whose roll succeeded
      are listed as 'OK' as well.
  """
  # pylint: disable=I0011,R0914,R0912
  control = CodeReviewHTMLParser.parse(control_url)
  roll = CodeReviewHTMLParser.parse(roll_url)
  if control is None or roll is None:
    # parse() returns None (after reporting the URL on stderr) when
    # a page cannot be fetched.  Without both pages there is nothing
    # to compare, and set(None) below would raise TypeError.
    print >> sys.stderr, 'Error:  could not load both codereviews.'
    return
  all_bots = set(control) & set(roll)  # Set intersection.
  if not all_bots:
    print >> sys.stderr, (
      'Error:  control %s and roll %s have no common trybots.'
      % (list(control), list(roll)))
    return

  control_name = '[control %s]' % control_url.split('/')[-1]
  roll_name = '[roll %s]' % roll_url.split('/')[-1]

  out = sys.stdout

  for bot in sorted(all_bots):
    if roll[bot].status == 'success':
      if verbosity > 1:
        printer(0, '==%s==' % bot)
        printer(1, 'OK')
      continue

    if control[bot].status != 'failure' and roll[bot].status != 'failure':
      # Neither side failed (e.g. still pending): no details to show.
      continue
    printer(0, '==%s==' % bot)

    # Control first, roll second; both loops below rely on that order.
    sides = [
        (control[bot].status, control_name, control[bot].url),
        (roll[bot].status, roll_name, roll[bot].url)]

    formatted_results = []
    for (status, _, url) in sides:
      lines = []
      if status == 'failure':
        for result in BuilderHTMLParser.parse(url):
          # Put each '*.html' token on its own line, prefixed by '__'.
          formatted_result = re.sub(
              r'(\S*\.html) ', '\n__\\g<1>\n', result.text)
          # Strip runtimes.
          formatted_result = re.sub(r'\(.*\)', '', formatted_result)
          lines.append((2, formatted_result))
          if 'compile' in result.text or '...and more' in result.text:
            # Point at the log: replace the last path component of the
            # builder URL with the (relative) stdio URL.
            lines.append((3, re.sub('/[^/]*$', '/', url) + result.url))
      formatted_results.append(lines)

    identical = formatted_results[0] == formatted_results[1]

    for (formatted_result, (status, name, _)) in zip(
        formatted_results, sides):
      if status != 'failure' and not identical:
        printer(1, name)
        printer(2, status)
      elif status == 'failure':
        if identical:
          printer(1, control_name + ' and ' + roll_name + ' failed identically')
        else:
          printer(1, name)
        for (indent, line) in formatted_result:
          printer(indent, line)
        if identical:
          # Both sides are the same; printing once is enough.
          break
    out.write('\n')

  if verbosity > 0:
    # Print out summary of all of the bots.
    out.write('%11s %11s %4s %s\n\n' %
          ('CONTROL', 'ROLL', 'DIFF', 'BOT'))
    for bot in sorted(all_bots):
      if roll[bot].status == 'success':
        diff = ''
      elif (control[bot].status == 'success' and
           roll[bot].status == 'failure'):
        # Control passed but the roll failed.
        diff = '!!!!'
      elif ('pending' in control[bot].status or
          'pending' in roll[bot].status):
        diff = '....'
      else:
        diff = '****'
      out.write('%11s %11s %4s %s\n' % (
          control[bot].status, roll[bot].status, diff, bot))
    out.write('\n')
    out.flush()
407 
if __name__ == '__main__':
  # Script entry point: expects CONTROL_URL and ROLL_URL as the first
  # two arguments; the verbosity comes from the
  # COMPARE_CODEREVIEW_VERBOSITY environment variable (default 1).
  if len(sys.argv) < 3:
    print >> sys.stderr, __doc__
    # sys.exit rather than the bare exit() helper, which is injected
    # by the site module and is not guaranteed to exist.
    sys.exit(1)
  main(sys.argv[1], sys.argv[2],
     int(os.environ.get('COMPARE_CODEREVIEW_VERBOSITY', 1)))
414 
415