1#!/usr/bin/env python 2 3import argparse 4import email.mime.multipart 5import email.mime.text 6import logging 7import os.path 8import pickle 9import re 10import smtplib 11import subprocess 12import sys 13from datetime import datetime, timedelta 14from phabricator import Phabricator 15 16# Setting up a virtualenv to run this script can be done by running the 17# following commands: 18# $ virtualenv venv 19# $ . ./venv/bin/activate 20# $ pip install Phabricator 21 22GIT_REPO_METADATA = (("llvm", "https://llvm.org/git/llvm.git"), ) 23 24# The below PhabXXX classes represent objects as modelled by Phabricator. 25# The classes can be serialized to disk, to try and make sure that we don't 26# needlessly have to re-fetch lots of data from Phabricator, as that would 27# make this script unusably slow. 28 29 30class PhabObject: 31 OBJECT_KIND = None 32 33 def __init__(self, id): 34 self.id = id 35 36 37class PhabObjectCache: 38 def __init__(self, PhabObjectClass): 39 self.PhabObjectClass = PhabObjectClass 40 self.most_recent_info = None 41 self.oldest_info = None 42 self.id2PhabObjects = {} 43 44 def get_name(self): 45 return self.PhabObjectClass.OBJECT_KIND + "sCache" 46 47 def get(self, id): 48 if id not in self.id2PhabObjects: 49 self.id2PhabObjects[id] = self.PhabObjectClass(id) 50 return self.id2PhabObjects[id] 51 52 def get_ids_in_cache(self): 53 return self.id2PhabObjects.keys() 54 55 def get_objects(self): 56 return self.id2PhabObjects.values() 57 58 DEFAULT_DIRECTORY = "PhabObjectCache" 59 60 def _get_pickle_name(self, directory): 61 file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle" 62 return os.path.join(directory, file_name) 63 64 def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY): 65 """ 66 FIXME: consider if serializing to JSON would bring interoperability 67 advantages over serializing to pickle. 68 """ 69 try: 70 f = open(self._get_pickle_name(directory), "rb") 71 except IOError as err: 72 print("Could not find cache. Error message: {0}. Continuing..." 73 .format(err)) 74 else: 75 with f: 76 try: 77 d = pickle.load(f) 78 self.__dict__.update(d) 79 except EOFError as err: 80 print("Cache seems to be corrupt. " + 81 "Not using cache. Error message: {0}".format(err)) 82 83 def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY): 84 if not os.path.exists(directory): 85 os.makedirs(directory) 86 with open(self._get_pickle_name(directory), "wb") as f: 87 pickle.dump(self.__dict__, f) 88 print("wrote cache to disk, most_recent_info= {0}".format( 89 datetime.fromtimestamp(self.most_recent_info) 90 if self.most_recent_info is not None else None)) 91 92 93class PhabReview(PhabObject): 94 OBJECT_KIND = "Review" 95 96 def __init__(self, id): 97 PhabObject.__init__(self, id) 98 99 def update(self, title, dateCreated, dateModified, author): 100 self.title = title 101 self.dateCreated = dateCreated 102 self.dateModified = dateModified 103 self.author = author 104 105 def setPhabDiffs(self, phabDiffs): 106 self.phabDiffs = phabDiffs 107 108 109class PhabUser(PhabObject): 110 OBJECT_KIND = "User" 111 112 def __init__(self, id): 113 PhabObject.__init__(self, id) 114 115 def update(self, phid, realName): 116 self.phid = phid 117 self.realName = realName 118 119 120class PhabHunk: 121 def __init__(self, rest_api_hunk): 122 self.oldOffset = int(rest_api_hunk["oldOffset"]) 123 self.oldLength = int(rest_api_hunk["oldLength"]) 124 # self.actual_lines_changed_offset will contain the offsets of the 125 # lines that were changed in this hunk. 126 self.actual_lines_changed_offset = [] 127 offset = self.oldOffset 128 inHunk = False 129 hunkStart = -1 130 contextLines = 3 131 for line in rest_api_hunk["corpus"].split("\n"): 132 if line.startswith("+"): 133 # line is a new line that got introduced in this patch. 134 # Do not record it as a changed line. 135 if inHunk is False: 136 inHunk = True 137 hunkStart = max(self.oldOffset, offset - contextLines) 138 continue 139 if line.startswith("-"): 140 # line was changed or removed from the older version of the 141 # code. Record it as a changed line. 142 if inHunk is False: 143 inHunk = True 144 hunkStart = max(self.oldOffset, offset - contextLines) 145 offset += 1 146 continue 147 # line is a context line. 148 if inHunk is True: 149 inHunk = False 150 hunkEnd = offset + contextLines 151 self.actual_lines_changed_offset.append((hunkStart, hunkEnd)) 152 offset += 1 153 if inHunk is True: 154 hunkEnd = offset + contextLines 155 self.actual_lines_changed_offset.append((hunkStart, hunkEnd)) 156 157 # The above algorithm could result in adjacent or overlapping ranges 158 # being recorded into self.actual_lines_changed_offset. 159 # Merge the adjacent and overlapping ranges in there: 160 t = [] 161 lastRange = None 162 for start, end in self.actual_lines_changed_offset + \ 163 [(sys.maxsize, sys.maxsize)]: 164 if lastRange is None: 165 lastRange = (start, end) 166 else: 167 if lastRange[1] >= start: 168 lastRange = (lastRange[0], end) 169 else: 170 t.append(lastRange) 171 lastRange = (start, end) 172 self.actual_lines_changed_offset = t 173 174 175class PhabChange: 176 def __init__(self, rest_api_change): 177 self.oldPath = rest_api_change["oldPath"] 178 self.hunks = [PhabHunk(h) for h in rest_api_change["hunks"]] 179 180 181class PhabDiff(PhabObject): 182 OBJECT_KIND = "Diff" 183 184 def __init__(self, id): 185 PhabObject.__init__(self, id) 186 187 def update(self, rest_api_results): 188 self.revisionID = rest_api_results["revisionID"] 189 self.dateModified = int(rest_api_results["dateModified"]) 190 self.dateCreated = int(rest_api_results["dateCreated"]) 191 self.changes = [PhabChange(c) for c in rest_api_results["changes"]] 192 193 194class ReviewsCache(PhabObjectCache): 195 def __init__(self): 196 PhabObjectCache.__init__(self, PhabReview) 197 198 199class UsersCache(PhabObjectCache): 200 def __init__(self): 201 PhabObjectCache.__init__(self, PhabUser) 202 203 204reviews_cache = ReviewsCache() 205users_cache = UsersCache() 206 207 208def init_phab_connection(): 209 phab = Phabricator() 210 phab.update_interfaces() 211 return phab 212 213 214def update_cached_info(phab, cache, phab_query, order, record_results, 215 max_nr_entries_per_fetch, max_nr_days_to_cache): 216 q = phab 217 LIMIT = max_nr_entries_per_fetch 218 for query_step in phab_query: 219 q = getattr(q, query_step) 220 results = q(order=order, limit=LIMIT) 221 most_recent_info, oldest_info = record_results(cache, results, phab) 222 oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - \ 223 timedelta(days=max_nr_days_to_cache) 224 most_recent_info_overall = most_recent_info 225 cache.write_cache_to_disk() 226 after = results["cursor"]["after"] 227 print("after: {0!r}".format(after)) 228 print("most_recent_info: {0}".format( 229 datetime.fromtimestamp(most_recent_info))) 230 while (after is not None 231 and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch): 232 need_more_older_data = \ 233 (cache.oldest_info is None or 234 datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch) 235 print(("need_more_older_data={0} cache.oldest_info={1} " + 236 "oldest_info_to_fetch={2}").format( 237 need_more_older_data, 238 datetime.fromtimestamp(cache.oldest_info) 239 if cache.oldest_info is not None else None, 240 oldest_info_to_fetch)) 241 need_more_newer_data = \ 242 (cache.most_recent_info is None or 243 cache.most_recent_info < most_recent_info) 244 print(("need_more_newer_data={0} cache.most_recent_info={1} " + 245 "most_recent_info={2}") 246 .format(need_more_newer_data, cache.most_recent_info, 247 most_recent_info)) 248 if not need_more_older_data and not need_more_newer_data: 249 break 250 results = q(order=order, after=after, limit=LIMIT) 251 most_recent_info, oldest_info = record_results(cache, results, phab) 252 after = results["cursor"]["after"] 253 print("after: {0!r}".format(after)) 254 print("most_recent_info: {0}".format( 255 datetime.fromtimestamp(most_recent_info))) 256 cache.write_cache_to_disk() 257 cache.most_recent_info = most_recent_info_overall 258 if after is None: 259 # We did fetch all records. Mark the cache to contain all info since 260 # the start of time. 261 oldest_info = 0 262 cache.oldest_info = oldest_info 263 cache.write_cache_to_disk() 264 265 266def record_reviews(cache, reviews, phab): 267 most_recent_info = None 268 oldest_info = None 269 for reviewInfo in reviews["data"]: 270 if reviewInfo["type"] != "DREV": 271 continue 272 id = reviewInfo["id"] 273 # phid = reviewInfo["phid"] 274 dateModified = int(reviewInfo["fields"]["dateModified"]) 275 dateCreated = int(reviewInfo["fields"]["dateCreated"]) 276 title = reviewInfo["fields"]["title"] 277 author = reviewInfo["fields"]["authorPHID"] 278 phabReview = cache.get(id) 279 if "dateModified" not in phabReview.__dict__ or \ 280 dateModified > phabReview.dateModified: 281 diff_results = phab.differential.querydiffs(revisionIDs=[id]) 282 diff_ids = sorted(diff_results.keys()) 283 phabDiffs = [] 284 for diff_id in diff_ids: 285 diffInfo = diff_results[diff_id] 286 d = PhabDiff(diff_id) 287 d.update(diffInfo) 288 phabDiffs.append(d) 289 phabReview.update(title, dateCreated, dateModified, author) 290 phabReview.setPhabDiffs(phabDiffs) 291 print("Updated D{0} modified on {1} ({2} diffs)".format( 292 id, datetime.fromtimestamp(dateModified), len(phabDiffs))) 293 294 if most_recent_info is None: 295 most_recent_info = dateModified 296 elif most_recent_info < dateModified: 297 most_recent_info = dateModified 298 299 if oldest_info is None: 300 oldest_info = dateModified 301 elif oldest_info > dateModified: 302 oldest_info = dateModified 303 return most_recent_info, oldest_info 304 305 306def record_users(cache, users, phab): 307 most_recent_info = None 308 oldest_info = None 309 for info in users["data"]: 310 if info["type"] != "USER": 311 continue 312 id = info["id"] 313 phid = info["phid"] 314 dateModified = int(info["fields"]["dateModified"]) 315 # dateCreated = int(info["fields"]["dateCreated"]) 316 realName = info["fields"]["realName"] 317 phabUser = cache.get(id) 318 phabUser.update(phid, realName) 319 if most_recent_info is None: 320 most_recent_info = dateModified 321 elif most_recent_info < dateModified: 322 most_recent_info = dateModified 323 if oldest_info is None: 324 oldest_info = dateModified 325 elif oldest_info > dateModified: 326 oldest_info = dateModified 327 return most_recent_info, oldest_info 328 329 330PHABCACHESINFO = ((reviews_cache, ("differential", "revision", "search"), 331 "updated", record_reviews, 5, 7), 332 (users_cache, ("user", "search"), "newest", record_users, 333 100, 1000)) 334 335 336def load_cache(): 337 for cache, phab_query, order, record_results, _, _ in PHABCACHESINFO: 338 cache.populate_cache_from_disk() 339 print("Loaded {0} nr entries: {1}".format( 340 cache.get_name(), len(cache.get_ids_in_cache()))) 341 print("Loaded {0} has most recent info: {1}".format( 342 cache.get_name(), 343 datetime.fromtimestamp(cache.most_recent_info) 344 if cache.most_recent_info is not None else None)) 345 346 347def update_cache(phab): 348 load_cache() 349 for cache, phab_query, order, record_results, max_nr_entries_per_fetch, \ 350 max_nr_days_to_cache in PHABCACHESINFO: 351 update_cached_info(phab, cache, phab_query, order, record_results, 352 max_nr_entries_per_fetch, max_nr_days_to_cache) 353 ids_in_cache = cache.get_ids_in_cache() 354 print("{0} objects in {1}".format(len(ids_in_cache), cache.get_name())) 355 cache.write_cache_to_disk() 356 357 358def get_most_recent_reviews(days): 359 newest_reviews = sorted( 360 reviews_cache.get_objects(), key=lambda r: -r.dateModified) 361 if len(newest_reviews) == 0: 362 return newest_reviews 363 most_recent_review_time = \ 364 datetime.fromtimestamp(newest_reviews[0].dateModified) 365 cut_off_date = most_recent_review_time - timedelta(days=days) 366 result = [] 367 for review in newest_reviews: 368 if datetime.fromtimestamp(review.dateModified) < cut_off_date: 369 return result 370 result.append(review) 371 return result 372 373 374# All of the above code is about fetching data from Phabricator and caching it 375# on local disk. The below code contains the actual "business logic" for this 376# script. 377 378_userphid2realname = None 379 380 381def get_real_name_from_author(user_phid): 382 global _userphid2realname 383 if _userphid2realname is None: 384 _userphid2realname = {} 385 for user in users_cache.get_objects(): 386 _userphid2realname[user.phid] = user.realName 387 return _userphid2realname.get(user_phid, "unknown") 388 389 390def print_most_recent_reviews(phab, days, filter_reviewers): 391 msgs = [] 392 393 def add_msg(msg): 394 msgs.append(msg) 395 print(msg) 396 397 newest_reviews = get_most_recent_reviews(days) 398 add_msg(u"These are the reviews that look interesting to be reviewed. " + 399 u"The report below has 2 sections. The first " + 400 u"section is organized per review; the second section is organized " 401 + u"per potential reviewer.\n") 402 oldest_review = newest_reviews[-1] if len(newest_reviews) > 0 else None 403 oldest_datetime = \ 404 datetime.fromtimestamp(oldest_review.dateModified) \ 405 if oldest_review else None 406 add_msg((u"The report below is based on analyzing the reviews that got " + 407 u"touched in the past {0} days (since {1}). " + 408 u"The script found {2} such reviews.\n").format( 409 days, oldest_datetime, len(newest_reviews))) 410 reviewer2reviews_and_scores = {} 411 for i, review in enumerate(newest_reviews): 412 matched_reviewers = find_reviewers_for_review(review) 413 matched_reviewers = filter_reviewers(matched_reviewers) 414 if len(matched_reviewers) == 0: 415 continue 416 add_msg((u"{0:>3}. https://reviews.llvm.org/D{1} by {2}\n {3}\n" + 417 u" Last updated on {4}").format( 418 i, review.id, 419 get_real_name_from_author(review.author), review.title, 420 datetime.fromtimestamp(review.dateModified))) 421 for reviewer, scores in matched_reviewers: 422 add_msg(u" potential reviewer {0}, score {1}".format( 423 reviewer, 424 "(" + "/".join(["{0:.1f}%".format(s) for s in scores]) + ")")) 425 if reviewer not in reviewer2reviews_and_scores: 426 reviewer2reviews_and_scores[reviewer] = [] 427 reviewer2reviews_and_scores[reviewer].append((review, scores)) 428 429 # Print out a summary per reviewer. 430 for reviewer in sorted(reviewer2reviews_and_scores.keys()): 431 reviews_and_scores = reviewer2reviews_and_scores[reviewer] 432 reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True) 433 add_msg(u"\n\nSUMMARY FOR {0} (found {1} reviews):".format( 434 reviewer, len(reviews_and_scores))) 435 for review, scores in reviews_and_scores: 436 add_msg(u"[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format( 437 "/".join(["{0:.1f}%".format(s) for s in scores]), review.id, 438 review.title, get_real_name_from_author(review.author))) 439 return "\n".join(msgs) 440 441 442def get_git_cmd_output(cmd): 443 output = None 444 try: 445 logging.debug(cmd) 446 output = subprocess.check_output( 447 cmd, shell=True, stderr=subprocess.STDOUT) 448 except subprocess.CalledProcessError as e: 449 logging.debug(str(e)) 450 if output is None: 451 return None 452 return output.decode("utf-8", errors='ignore') 453 454 455reAuthorMail = re.compile("^author-mail <([^>]*)>.*$") 456 457 458def parse_blame_output_line_porcelain(blame_output): 459 email2nr_occurences = {} 460 if blame_output is None: 461 return email2nr_occurences 462 for line in blame_output.split('\n'): 463 m = reAuthorMail.match(line) 464 if m: 465 author_email_address = m.group(1) 466 if author_email_address not in email2nr_occurences: 467 email2nr_occurences[author_email_address] = 1 468 else: 469 email2nr_occurences[author_email_address] += 1 470 return email2nr_occurences 471 472 473def find_reviewers_for_diff_heuristic(diff): 474 # Heuristic 1: assume good reviewers are the ones that touched the same 475 # lines before as this patch is touching. 476 # Heuristic 2: assume good reviewers are the ones that touched the same 477 # files before as this patch is touching. 478 reviewers2nr_lines_touched = {} 479 reviewers2nr_files_touched = {} 480 # Assume last revision before diff was modified is the revision the diff 481 # applies to. 482 git_repo = "git_repos/llvm" 483 cmd = 'git -C {0} rev-list -n 1 --before="{1}" master'.format( 484 git_repo, 485 datetime.fromtimestamp( 486 diff.dateModified).strftime("%Y-%m-%d %H:%M:%s")) 487 base_revision = get_git_cmd_output(cmd).strip() 488 logging.debug("Base revision={0}".format(base_revision)) 489 for change in diff.changes: 490 path = change.oldPath 491 # Compute heuristic 1: look at context of patch lines. 492 for hunk in change.hunks: 493 for start_line, end_line in hunk.actual_lines_changed_offset: 494 # Collect git blame results for authors in those ranges. 495 cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e " + 496 "-w --line-porcelain -L {1},{2} {3} -- {4}").format( 497 git_repo, start_line, end_line, base_revision, path) 498 blame_output = get_git_cmd_output(cmd) 499 for reviewer, nr_occurences in \ 500 parse_blame_output_line_porcelain(blame_output).items(): 501 if reviewer not in reviewers2nr_lines_touched: 502 reviewers2nr_lines_touched[reviewer] = 0 503 reviewers2nr_lines_touched[reviewer] += nr_occurences 504 # Compute heuristic 2: don't look at context, just at files touched. 505 # Collect git blame results for authors in those ranges. 506 cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " + 507 "--line-porcelain {1} -- {2}").format(git_repo, base_revision, 508 path) 509 blame_output = get_git_cmd_output(cmd) 510 for reviewer, nr_occurences in parse_blame_output_line_porcelain( 511 blame_output).items(): 512 if reviewer not in reviewers2nr_files_touched: 513 reviewers2nr_files_touched[reviewer] = 0 514 reviewers2nr_files_touched[reviewer] += 1 515 516 # Compute "match scores" 517 total_nr_lines = sum(reviewers2nr_lines_touched.values()) 518 total_nr_files = len(diff.changes) 519 reviewers_matchscores = \ 520 [(reviewer, 521 (reviewers2nr_lines_touched.get(reviewer, 0)*100.0/total_nr_lines 522 if total_nr_lines != 0 else 0, 523 reviewers2nr_files_touched[reviewer]*100.0/total_nr_files 524 if total_nr_files != 0 else 0)) 525 for reviewer, nr_lines 526 in reviewers2nr_files_touched.items()] 527 reviewers_matchscores.sort(key=lambda i: i[1], reverse=True) 528 return reviewers_matchscores 529 530 531def find_reviewers_for_review(review): 532 # Process the newest diff first. 533 diffs = sorted( 534 review.phabDiffs, key=lambda d: d.dateModified, reverse=True) 535 if len(diffs) == 0: 536 return 537 diff = diffs[0] 538 matched_reviewers = find_reviewers_for_diff_heuristic(diff) 539 # Show progress, as this is a slow operation: 540 sys.stdout.write('.') 541 sys.stdout.flush() 542 logging.debug(u"matched_reviewers: {0}".format(matched_reviewers)) 543 return matched_reviewers 544 545 546def update_git_repos(): 547 git_repos_directory = "git_repos" 548 for name, url in GIT_REPO_METADATA: 549 dirname = os.path.join(git_repos_directory, name) 550 if not os.path.exists(dirname): 551 cmd = "git clone {0} {1}".format(url, dirname) 552 output = get_git_cmd_output(cmd) 553 cmd = "git -C {0} pull --rebase".format(dirname) 554 output = get_git_cmd_output(cmd) 555 556 557def send_emails(email_addresses, sender, msg): 558 s = smtplib.SMTP() 559 s.connect() 560 for email_address in email_addresses: 561 email_msg = email.mime.multipart.MIMEMultipart() 562 email_msg['From'] = sender 563 email_msg['To'] = email_address 564 email_msg['Subject'] = 'LLVM patches you may be able to review.' 565 email_msg.attach(email.mime.text.MIMEText(msg.encode('utf-8'), 'plain')) 566 # python 3.x: s.send_message(email_msg) 567 s.sendmail(email_msg['From'], email_msg['To'], email_msg.as_string()) 568 s.quit() 569 570 571def filter_reviewers_to_report_for(people_to_look_for): 572 # The below is just an example filter, to only report potential reviews 573 # to do for the people that will receive the report email. 574 return lambda potential_reviewers: [r for r in potential_reviewers 575 if r[0] in people_to_look_for] 576 577 578def main(): 579 parser = argparse.ArgumentParser( 580 description='Match open reviews to potential reviewers.') 581 parser.add_argument( 582 '--no-update-cache', 583 dest='update_cache', 584 action='store_false', 585 default=True, 586 help='Do not update cached Phabricator objects') 587 parser.add_argument( 588 '--email-report', 589 dest='email_report', 590 nargs='*', 591 default="", 592 help="A email addresses to send the report to.") 593 parser.add_argument( 594 '--sender', 595 dest='sender', 596 default="", 597 help="The email address to use in 'From' on messages emailed out.") 598 parser.add_argument( 599 '--email-addresses', 600 dest='email_addresses', 601 nargs='*', 602 help="The email addresses (as known by LLVM git) of " + 603 "the people to look for reviews for.") 604 parser.add_argument('--verbose', '-v', action='count') 605 606 args = parser.parse_args() 607 608 if args.verbose >= 1: 609 logging.basicConfig(level=logging.DEBUG) 610 611 people_to_look_for = [e.decode('utf-8') for e in args.email_addresses] 612 logging.debug("Will look for reviews that following contributors could " + 613 "review: {}".format(people_to_look_for)) 614 logging.debug("Will email a report to: {}".format(args.email_report)) 615 616 phab = init_phab_connection() 617 618 if args.update_cache: 619 update_cache(phab) 620 621 load_cache() 622 update_git_repos() 623 msg = print_most_recent_reviews( 624 phab, 625 days=1, 626 filter_reviewers=filter_reviewers_to_report_for(people_to_look_for)) 627 628 if args.email_report != []: 629 send_emails(args.email_report, args.sender, msg) 630 631 632if __name__ == "__main__": 633 main() 634