# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import base64
import xml.dom.minidom as minidom
from xml.parsers.expat import ExpatError

import crash_utils
from repository_parser_interface import ParserInterface

FILE_CHANGE_TYPE_MAP = {
    'add': 'A',
    'copy': 'C',
    'delete': 'D',
    'modify': 'M',
    'rename': 'R'
}


def _ConvertToFileChangeType(file_action):
  # TODO(stgao): verify impact on code that checks the file change type.
  return file_action[0].upper()
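

# Note (illustrative): for every action listed in FILE_CHANGE_TYPE_MAP,
# taking the first letter uppercased gives the same result as the map lookup,
# e.g. _ConvertToFileChangeType('modify') == FILE_CHANGE_TYPE_MAP['modify']
# == 'M'.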


class GitParser(ParserInterface):
  """Parser for Git repository in googlesource.

  Attributes:
    parsed_deps: A map from component path to its repository name, regression,
                 etc.
    url_parts_map: A map from url type to its url parts. These parts are
                   appended to the base url to form different urls.
  """

  def __init__(self, parsed_deps, url_parts_map):
    self.component_to_url_map = parsed_deps
    self.url_parts_map = url_parts_map

  def ParseChangelog(self, component_path, range_start, range_end):
    """Parses the changelog of the given component over the given range.

    Returns:
      A tuple (revision_map, file_to_revision_map).
    """
    file_to_revision_map = {}
    revision_map = {}
    base_url = self.component_to_url_map[component_path]['repository']
    changelog_url = base_url + self.url_parts_map['changelog_url']
    revision_url = base_url + self.url_parts_map['revision_url']

    # Retrieve data from the url, and return empty maps if it fails. The html
    # url is a url from which the changelog can be parsed as html.
    url = changelog_url % (range_start, range_end)
    html_url = url + '?pretty=fuller'
    response = crash_utils.GetDataFromURL(html_url)
    if not response:
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, try parsing
    # from JSON objects instead.
    try:
      dom = minidom.parseString(response)
    except ExpatError:
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # The revision information is in the divs from the third one to the
    # second-to-last one.
    divs = dom.getElementsByTagName('div')[2:-1]
    pres = dom.getElementsByTagName('pre')
    uls = dom.getElementsByTagName('ul')

    # Divs, pres and uls each contain revision information for one CL, so
    # they should all have the same length.
    if not divs or len(divs) != len(pres) or len(pres) != len(uls):
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # Iterate through the divs and parse revisions.
    for (div, pre, ul) in zip(divs, pres, uls):
      # Create a new revision object for each revision.
      revision = {}

      # There must be three <tr>s. If not, this page is wrong.
      trs = div.getElementsByTagName('tr')
      if len(trs) != 3:
        continue

      # Retrieve the git hash.
      githash = trs[0].getElementsByTagName('a')[0].firstChild.nodeValue

      # Retrieve and set the author.
      author = trs[1].getElementsByTagName(
          'td')[0].firstChild.nodeValue.split('<')[0]
      revision['author'] = author

      # Retrieve and set the message.
      revision['message'] = pre.firstChild.nodeValue

      # Set the url of this CL.
      revision_url_part = self.url_parts_map['revision_url'] % githash
      revision['url'] = base_url + revision_url_part

      # Go through the changed files, which are in <li> elements.
      lis = ul.getElementsByTagName('li')
      for li in lis:
        # Retrieve the path and action of the changed file.
        file_path = li.getElementsByTagName('a')[0].firstChild.nodeValue
        file_change_type = li.getElementsByTagName('span')[
            0].getAttribute('class')

        # Normalize the file action so that it matches the SVN parser's.
        file_change_type = _ConvertToFileChangeType(file_change_type)

        # Add the changed file to the map.
        if file_path not in file_to_revision_map:
          file_to_revision_map[file_path] = []
        file_to_revision_map[file_path].append((githash, file_change_type))

      # Add this revision object to the map.
      revision_map[githash] = revision

    # Parse one extra revision for range_start, because googlesource does not
    # include the start of the range in the changelog.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

    return (revision_map, file_to_revision_map)
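
  # Note on the JSON paths below: gitiles prefixes every JSON response with
  # the anti-XSSI string ")]}'\n" (5 characters), so the parsers strip it
  # with response[5:], i.e. start from the 6th character, before calling
  # crash_utils.LoadJSON.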

  def ParseChangelogFromJSON(self, range_start, range_end, changelog_url,
                             revision_url, revision_map,
                             file_to_revision_map):
    """Parses the changelog by going over the JSON file.

    Args:
      range_start: Start of the regression range.
      range_end: End of the regression range.
      changelog_url: The url to retrieve the changelog from.
      revision_url: The url to retrieve an individual revision from.
      revision_map: A map from a git hash to its revision information.
      file_to_revision_map: A map from a file to the git hashes in which it
                            occurs.
    """
    # Compute the URL from the given range and retrieve the changelog. Stop
    # if it fails.
    changelog_url %= (range_start, range_end)
    json_url = changelog_url + '?format=json'
    response = crash_utils.GetDataFromURL(json_url)
    if not response:
      return

    # Parse the changelog from the returned object. The returned string
    # should start with ")]}'\n", so start from the 6th character.
    revisions = crash_utils.LoadJSON(response[5:])
    if not revisions:
      return

    # Parse each individual revision in the log.
    for revision in revisions['log']:
      githash = revision['commit']
      self.ParseRevision(revision_url, githash, revision_map,
                         file_to_revision_map)

    # Parse the revision at range_start, because googlesource ignores
    # that one.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

  def ParseRevision(self, revision_url, githash, revision_map,
                    file_to_revision_map):
    """Parses one revision and adds its information to the given maps."""
    # Retrieve data from the URL, and return if it fails.
    url = revision_url % githash
    response = crash_utils.GetDataFromURL(url + '?format=json')
    if not response:
      return

    # Load a JSON object from the string. If it fails, terminate the
    # function.
    json_revision = crash_utils.LoadJSON(response[5:])
    if not json_revision:
      return

    # Create a map representing this revision and get the githash from the
    # JSON object.
    revision = {}
    githash = json_revision['commit']

    # Set the author, message and URL of this CL.
    revision['author'] = json_revision['author']['name']
    revision['message'] = json_revision['message']
    revision['url'] = url

    # Iterate through the changed files.
    for diff in json_revision['tree_diff']:
      file_path = diff['new_path']
      file_change_type = diff['type']

      # Normalize the file action so that it fits with svn_repository_parser.
      file_change_type = _ConvertToFileChangeType(file_change_type)

      # Add the file to the map.
      if file_path not in file_to_revision_map:
        file_to_revision_map[file_path] = []
      file_to_revision_map[file_path].append((githash, file_change_type))

    # Add this CL to the map.
    revision_map[githash] = revision

  def ParseLineDiff(self, path, component, file_change_type, githash):
    """Parses the line diff of the given file in the given revision."""
    changed_line_numbers = []
    changed_line_contents = []
    base_url = self.component_to_url_map[component]['repository']
    backup_url = (base_url + self.url_parts_map['revision_url']) % githash

    # If the file was added, copied or renamed (not modified), treat it as if
    # it is not changed.
    if file_change_type in ('A', 'C', 'R'):
      # TODO(stgao): Maybe return the whole file change for Add, Rename and
      # Copy?
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve the diff data from the URL, and if it fails, return empty
    # lines.
    url = (base_url + self.url_parts_map['diff_url']) % (githash, path)
    data = crash_utils.GetDataFromURL(url + '?format=text')
    if not data:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Decode the returned base64 object into the lines of the diff.
    diff = base64.b64decode(data).splitlines()

    # Iterate through the lines in the diff. current_line stays -1 until the
    # first chunk header, so it also tells us whether we are inside a chunk.
    current_line = -1
    for line in diff:
      line = line.strip()

      # If a line starts with @@, a new chunk starts.
      if line.startswith('@@'):
        current_line = int(line.split('+')[1].split(',')[0])

      # If we are in a chunk.
      elif current_line != -1:
        # If the line is either added or modified. The '+' prefix is a single
        # character, so drop only that character to recover the content.
        if line.startswith('+'):
          changed_line_numbers.append(current_line)
          changed_line_contents.append(line[1:])

        # Do not increment the current line if the change is a delete.
        if not line.startswith('-'):
          current_line += 1

    # Return the url without '?format=text'.
    return (url, changed_line_numbers, changed_line_contents)
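
  # Illustrative example of the chunk-header parsing above (hypothetical
  # header): for the line "@@ -120,8 +121,9 @@", split('+')[1] is
  # "121,9 @@" and split(',')[0] is "121", so current_line starts at 121,
  # the first line number of the post-change hunk.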

  def ParseBlameInfo(self, component, file_path, line, revision):
    """Parses the blame information for the given line of a file."""
    base_url = self.component_to_url_map[component]['repository']

    # Retrieve the blame JSON file from googlesource. If it fails, return
    # None.
    url_part = self.url_parts_map['blame_url'] % (revision, file_path)
    blame_url = base_url + url_part
    json_string = crash_utils.GetDataFromURL(blame_url)
    if not json_string:
      return

    # Parse a JSON object from the string. The returned string should start
    # with ")]}'\n", so start from the 6th character.
    annotation = crash_utils.LoadJSON(json_string[5:])
    if not annotation:
      return

    # Go through the regions, each of which is a run of consecutive lines
    # with the same author/revision.
    for blame_line in annotation['regions']:
      start = blame_line['start']
      count = blame_line['count']

      # For each region, check whether the line we want the blame info of
      # falls in this region.
      if start <= line <= start + count - 1:
        # If we are in the right region, get the information from the line.
        revision = blame_line['commit']
        author = blame_line['author']['name']
        revision_url_parts = self.url_parts_map['revision_url'] % revision
        revision_url = base_url + revision_url_parts
        # TODO(jeun): Add a way to get the content from the JSON object.
        content = None

        (revision_info, _) = self.ParseChangelog(component, revision,
                                                 revision)
        message = revision_info[revision]['message']
        return (content, revision, author, revision_url, message)

    # Return None if no region contains the line.
    return None
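

# Illustrative usage sketch (hypothetical deps and url parts; the real values
# come from the crash analysis configuration, and the url patterns below are
# only assumed to follow the gitiles %-format convention this parser expects):
#   parsed_deps = {
#       'src/': {
#           'repository': 'https://chromium.googlesource.com/chromium/src'
#       }
#   }
#   url_parts_map = {
#       'changelog_url': '/+log/%s..%s',
#       'revision_url': '/+/%s',
#   }
#   parser = GitParser(parsed_deps, url_parts_map)
#   revision_map, file_map = parser.ParseChangelog('src/', start_sha, end_sha)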