#!/usr/bin/python2

# Copyright 2014 Google Inc.
#
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Skia's Chromium Codereview Comparison Script.

This script takes two Codereview URLs, looks at the trybot results for
the two codereviews and compares the results.

Usage:
  compare_codereview.py CONTROL_URL ROLL_URL
"""

import collections
import contextlib
import os
import re
import sys

try:
    import HTMLParser
    from urllib2 import URLError, urlopen
except ImportError:
    # Python 3 renamed these modules; the pieces used below are the same.
    import html.parser as HTMLParser
    from urllib.error import URLError
    from urllib.request import urlopen


def _fetch_page(url):
    """Downloads a page and returns its body as text.

    Args:
        url (string), the address to download.

    Returns:
        The page body as a str, or None if urlopen() raised URLError
        (in which case 'Error getting <url>' is written to stderr).
    """
    try:
        # closing() releases the connection even when read() fails; the
        # Python 2 response object is not a context manager on its own.
        with contextlib.closing(urlopen(url)) as response:
            data = response.read()
    except URLError:
        sys.stderr.write('Error getting %s\n' % url)
        return None
    if not isinstance(data, str):
        # Python 3 hands back bytes, but HTMLParser.feed() needs text.
        data = data.decode('utf-8', 'replace')
    return data


class CodeReviewHTMLParser(HTMLParser.HTMLParser):
    """Parses CodeReview web page.

    Use the CodeReviewHTMLParser.parse static function to make use of
    this class.

    This uses the HTMLParser class because it's the best thing in
    Python's standard library.  We need a little more power than a
    regex.  (Search for "You can't parse [X]HTML with regex." for more
    information.)
    """
    # pylint: disable=I0011,R0904
    @staticmethod
    def parse(url):
        """Parses a CodeReview web page.

        Args:
            url (string), a codereview URL like this:
                'https://codereview.chromium.org/?????????'.

        Returns:
            A dictionary; the keys are bot_name strings, the values
            are CodeReviewHTMLParser.Status objects.  None is returned
            when the page could not be downloaded.
        """
        page = _fetch_page(url)
        if page is None:
            return None
        parser = CodeReviewHTMLParser()
        parser.feed(page)
        parser.close()
        return parser.statuses

    # namedtuples are like lightweight structs in Python.  The low
    # overhead of a tuple, but the ease of use of an object.
    Status = collections.namedtuple('Status', ['status', 'url'])

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # id of the most recent <div id="tryjobdiv*"> seen, if any.
        self._id = None
        # status/href attributes of the build-result link being read.
        self._status = None
        self._href = None
        # Text collected between <a class="build-result"> and </a>.
        self._anchor_data = ''
        self._currently_parsing_trybotdiv = False
        # statuses is a dictionary of CodeReviewHTMLParser.Status
        self.statuses = {}

    def handle_starttag(self, tag, attrs):
        """Overrides the HTMLParser method to implement functionality.

        Called by HTMLParser for every start tag.  tag is the
        lower-cased tag name and attrs is a list of (name, value)
        pairs; value is None for an attribute written without one.

        Remembers when a <div id="tryjobdiv*"> has been seen and, from
        then on, starts capturing every <a class="build-result">.
        """
        attrs = dict(attrs)
        if tag == 'div':
            # We are looking for <div id="tryjobdiv*">.
            id_attr = attrs.get('id') or ''
            if id_attr.startswith('tryjobdiv'):
                self._id = id_attr
        if (self._id and tag == 'a'
                and 'build-result' in (attrs.get('class') or '').split()):
            # Once a <div id="tryjobdiv*"> has been seen, we look for
            # a link of the form <a class="build-result" href="*">.
            # Then we save the (non-standard) status attribute and
            # the URL.
            self._status = attrs.get('status')
            self._href = attrs.get('href')
            # Start saving anchor data, dropping anything left over
            # from an anchor that was never closed.
            self._anchor_data = ''
            self._currently_parsing_trybotdiv = True

    def handle_data(self, data):
        """Overrides the HTMLParser method to implement functionality.

        Called by HTMLParser with text nodes.  Saves the text inside
        the <a></a> tags being captured.  Assumes <a> tags aren't
        nested.
        """
        if self._currently_parsing_trybotdiv:
            self._anchor_data += data

    def handle_endtag(self, tag):
        """Overrides the HTMLParser method to implement functionality.

        Called by HTMLParser for every end tag.  On the </a> that
        closes a captured link, stores the accumulated text as the bot
        name (when both a name and a status are present) and resets
        the capture state.
        """
        if tag != 'a' or not self._currently_parsing_trybotdiv:
            return
        bot = self._anchor_data.strip()
        if bot and self._status:
            # Add to accumulating dictionary.
            self.statuses[bot] = CodeReviewHTMLParser.Status(
                status=self._status, url=self._href)
        # Always reset, even when the link had no status attribute;
        # otherwise its text would leak into the next bot's name.
        self._currently_parsing_trybotdiv = False
        self._anchor_data = ''
        self._status = None
        self._href = None


def _clean_failure_text(text):
    """Tidies the raw text of a failing <li> for display.

    Strips surrounding whitespace, collapses a first word that is
    repeated at the start, drops everything from 'unexpected flaky' to
    the end of that line, and removes the words 'preamble' and 'stdio'.
    """
    result = text.strip()
    words = result.split()
    if words:
        first = words[0]
        # Sometimes, it repeats the same thing multiple times.  The
        # word is escaped so that names containing regex
        # metacharacters are matched literally, and the replacement is
        # a function so backslashes in it are not interpreted.
        repeated = r'^%s(\s+%s)+' % ((re.escape(first),) * 2)
        result = re.sub(repeated, lambda _match: first, result)
    result = re.sub(r'unexpected flaky.*', '', result)
    # Remove some extra unnecessary text.
    result = re.sub(r'\bpreamble\b', '', result)
    result = re.sub(r'\bstdio\b', '', result)
    return result


class BuilderHTMLParser(HTMLParser.HTMLParser):
    """Parses Trybot web pages.

    Use the BuilderHTMLParser.parse static function to make use of
    this class.

    This uses the HTMLParser class because it's the best thing in
    Python's standard library.  We need a little more power than a
    regex.  (Search for "You can't parse [X]HTML with regex." for more
    information.)
    """
    # pylint: disable=I0011,R0904
    @staticmethod
    def parse(url):
        """Parses a Trybot web page.

        Args:
            url (string), a trybot result URL.

        Returns:
            An array of BuilderHTMLParser.Results, each a description
            of failure results, along with an optional url.  The array
            is empty when the page could not be downloaded.
        """
        page = _fetch_page(url)
        if page is None:
            return []
        parser = BuilderHTMLParser()
        parser.feed(page)
        parser.close()
        return parser.failure_results

    Result = collections.namedtuple('Result', ['text', 'url'])

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.failure_results = []
        # Number of <li> elements currently open.
        self._li_level = 0
        # Text collected for the failing <li> being read.
        self._li_data = ''
        # True while inside an <li> that contained a failure <div>.
        self._current_failure = False
        # href of the '/logs/stdio' link found in that <li>, if any.
        self._failure_results_url = ''

    def handle_starttag(self, tag, attrs):
        """Overrides the HTMLParser method to implement functionality.

        Called by HTMLParser for every start tag.  tag is the
        lower-cased tag name and attrs is a list of (name, value)
        pairs; value is None for an attribute written without one.

        Tracks <li> nesting, notices <div class="failure result">
        inside a list item, and remembers the stdio log link of the
        failing item.
        """
        attrs = dict(attrs)
        if tag == 'li':
            # <li> tags can be nested.  So we have to count the
            # nest-level for backing out.
            self._li_level += 1
            return
        if tag == 'div' and attrs.get('class') == 'failure result':
            # We care about this sort of thing:
            #   <li>
            #     <li>
            #       <li>
            #         <div class="failure result">...</div>
            #       </li>
            #     </li>
            #     We want this text here.
            #   </li>
            if self._li_level > 0:
                self._current_failure = True  # Tells us to keep text.
            return

        if tag == 'a' and self._current_failure:
            # An <a> may have no href at all (value None or missing).
            href = attrs.get('href') or ''
            # Sometimes we want to keep the stdio url.  We always
            # return it, just in case.
            if href.endswith('/logs/stdio'):
                self._failure_results_url = href

    def handle_data(self, data):
        """Overrides the HTMLParser method to implement functionality.

        Called by HTMLParser with text nodes.  Keeps the text while a
        failing list item is being read.
        """
        if self._current_failure:
            self._li_data += data

    def handle_endtag(self, tag):
        """Overrides the HTMLParser method to implement functionality.

        Called by HTMLParser for every end tag.  When the outermost
        <li> closes, records a Result for it if it contained a failure
        and resets the per-item state.
        """
        if tag != 'li' or self._li_level == 0:
            # Not a list item, or a stray </li> with nothing open.
            # Counting the latter would push the level negative and
            # hide every failure that follows.
            return
        self._li_level -= 1
        if self._li_level:
            return
        if self._current_failure:
            self.failure_results.append(BuilderHTMLParser.Result(
                _clean_failure_text(self._li_data),
                self._failure_results_url))
        # Reset the state.
        self._current_failure = False
        self._li_data = ''
        self._failure_results_url = ''


def _wrap_to(line, columns):
    """Wrap a line to the given number of columns, return a list
    of strings.

    Words are never split, so a word longer than columns gets a line
    of its own.  A line with no words produces an empty list.
    """
    wrapped = []
    current = ''
    for word in line.split():
        if not current:
            current = word
        elif len(current) + 1 + len(word) > columns:
            wrapped.append(current)
            current = word
        else:
            current += ' ' + word
    if current:
        wrapped.append(current)
    return wrapped


def printer(indent, string):
    """Print indented, wrapped text.

    Args:
        indent: (int) indentation level.
        string: (string) text to print to sys.stdout; it may contain
            newlines.  Lines without any words print nothing.
    """
    out = sys.stdout
    # Two columns per level, matching the 2 * indent width budget.
    spacer = '  '
    for line in string.split('\n'):
        for i, wrapped_line in enumerate(_wrap_to(line, 68 - (2 * indent))):
            # Continuation lines are indented one extra level.
            depth = indent + 1 if i > 0 else indent
            out.write(spacer * depth + wrapped_line + '\n')
    out.flush()


def _failure_lines(url):
    """Downloads a failed trybot page and formats its failures.

    Returns:
        A list of (indent, text) pairs ready to be passed to printer().
    """
    lines = []
    for result in BuilderHTMLParser.parse(url):
        # Put each *.html name on a line of its own.
        text = re.sub(r'(\S*\.html) ', '\n__\\g<1>\n', result.text)
        # Strip runtimes.
        text = re.sub(r'\(.*\)', '', text)
        lines.append((2, text))
        if 'compile' in result.text or '...and more' in result.text:
            # result.url is appended to the directory part of the
            # builder url.
            lines.append((3, re.sub('/[^/]*$', '/', url) + result.url))
    return lines


def _diff_marker(control_status, roll_status):
    """Returns the DIFF column text for one bot in the summary."""
    if roll_status == 'success':
        return ''
    if control_status == 'success' and roll_status == 'failure':
        return '!!!!'
    if 'pending' in control_status or 'pending' in roll_status:
        return '....'
    return '****'


def main(control_url, roll_url, verbosity=1):
    """Compare two Codereview URLs

    Args:
        control_url, roll_url: (strings) URL of the format
          https://codereview.chromium.org/?????????

        verbosity: (int) verbose level.  0, 1, or 2.
    """
    # pylint: disable=I0011,R0914,R0912
    control = CodeReviewHTMLParser.parse(control_url)
    roll = CodeReviewHTMLParser.parse(roll_url)
    if control is None or roll is None:
        # parse() has already reported which URL could not be read.
        return
    all_bots = set(control) & set(roll)  # Set intersection.
    if not all_bots:
        sys.stderr.write(
            'Error: control %s and roll %s have no common trybots.\n'
            % (list(control), list(roll)))
        return

    control_name = '[control %s]' % control_url.split('/')[-1]
    roll_name = '[roll %s]' % roll_url.split('/')[-1]

    out = sys.stdout

    for bot in sorted(all_bots):
        control_status, control_link = control[bot]
        roll_status, roll_link = roll[bot]

        if roll_status == 'success':
            if verbosity > 1:
                printer(0, '==%s==' % bot)
                printer(1, 'OK')
            continue

        if 'failure' not in (control_status, roll_status):
            continue
        printer(0, '==%s==' % bot)

        sides = [(control_status, control_name, control_link),
                 (roll_status, roll_name, roll_link)]
        # Only failed builds with a link have a page worth reading.
        formatted_results = [
            _failure_lines(url) if status == 'failure' and url else []
            for (status, _, url) in sides]

        # Two empty lists also compare equal, so insist that both
        # sides really failed before calling the failures identical.
        identical = (control_status == roll_status == 'failure'
                     and formatted_results[0] == formatted_results[1])

        for (lines, (status, name, _)) in zip(formatted_results, sides):
            if status == 'failure':
                if identical:
                    printer(1, control_name + ' and ' + roll_name
                            + ' failed identically')
                else:
                    printer(1, name)
                for (indent, line) in lines:
                    printer(indent, line)
                if identical:
                    # The shared output has been shown once already.
                    break
            elif not identical:
                printer(1, name)
                printer(2, status)
        out.write('\n')

    if verbosity > 0:
        # Print out summary of all of the bots.
        out.write('%11s %11s %4s %s\n\n' %
                  ('CONTROL', 'ROLL', 'DIFF', 'BOT'))
        for bot in sorted(all_bots):
            out.write('%11s %11s %4s %s\n' % (
                control[bot].status, roll[bot].status,
                _diff_marker(control[bot].status, roll[bot].status), bot))
        out.write('\n')
    out.flush()


if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.stderr.write(__doc__ + '\n')
        sys.exit(1)
    main(sys.argv[1], sys.argv[2],
         int(os.environ.get('COMPARE_CODEREVIEW_VERBOSITY', 1)))