# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import defaultdict, deque, namedtuple
from HTMLParser import HTMLParser, HTMLParseError
from itertools import groupby
from operator import itemgetter
import posixpath
from urlparse import urlsplit

from file_system_util import CreateURLsFromPaths


# The processed result of rendering a single URL:
#   status      - the HTTP status code returned by the render.
#   links       - hrefs of all links found on the page.
#   anchors     - all anchor targets (id and name attributes) on the page.
#   anchor_refs - links that contain an anchor ('#') component.
Page = namedtuple('Page', 'status, links, anchors, anchor_refs')


def _SplitAnchor(url):
  '''Splits |url| into its path and fragment (anchor) components.'''
  components = urlsplit(url)
  return components.path, components.fragment


def _Process(path, renderer):
  '''Render the page at |path| using a |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the http status code
  of the page render, the href of all the links that occurred on the page, all
  of the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-html page is properly rendered, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    # Unparseable html: report a successful render with no links or anchors.
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  base = path.rsplit('/', 1)[0] if '/' in path else ''
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      # Replace every '.' except the one before 'html' with '_'.
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      # A fragment-only link targets an anchor on this same page.
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link[1:]
      else:
        link = posixpath.normpath('%s/%s' % (base, link))

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.append(link)

  return Page(200, edges, anchors, anchor_refs)


class _ContentParser(HTMLParser):
  '''Parse an html file pulling out all links and anchor_refs, where an
  anchor_ref is a link that contains an anchor.
  '''

  def __init__(self):
    HTMLParser.__init__(self)
    self.links = []    # hrefs of links seen so far, in document order.
    self.anchors = set()  # id/name anchor targets seen so far.

  def handle_starttag(self, tag, raw_attrs):
    attrs = dict(raw_attrs)

    if tag == 'a':
      # Handle special cases for href's that: start with a space, contain
      # just a '.' (period), contain python templating code, are an absolute
      # url, are a zip file, or execute javascript on the page.
      href = attrs.get('href', '').strip()
      if (href and
          href != '.' and
          '{{' not in href and
          urlsplit(href).scheme not in ('http', 'https') and
          not href.endswith('.zip') and
          'javascript:' not in href):
        self.links.append(href)

    if attrs.get('id'):
      self.anchors.add(attrs['id'])
    if attrs.get('name'):
      self.anchors.add(attrs['name'])


class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links, those
  with a target page that 404s or contain an anchor that doesn't exist, or
  pages that have no links to them.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes
    for the orphaned page search.
    '''
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    # Unrendered (or unknown) urls default to a 404 Page.
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    # Pages that are intentionally unreachable from |root_pages| and should
    # not be reported as orphans.
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
    # Paths that are expected to redirect; not reported as broken.
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory rendering each URL and
    processing the resultant html to pull out all links and anchors.
    '''
    top_level_directories = (
      ('docs/templates/public', ''),
      ('docs/static', 'static/'),
      ('docs/examples', 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, _ in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          # Use a single formatted string: in Python 2, print(a, b, ...)
          # without the print_function future import outputs a tuple.
          print('%s, a url derived from the path %s, resulted in a %s' % (
              url, dirpath, self._pages[url].status))

  def _FollowRedirections(self, starting_url, limit=4):
    '''Follow redirection until a non-redirectable page is reached. Start at
    |starting_url| which must return a 301 or 302 status code.

    Return a tuple of: the status of rendering |starting_url|, the final url,
    and a list of the pages reached including |starting_url|. If no
    redirection occurred, or the redirection chain exceeds |limit|, returns
    (None, None, None).
    '''
    pages_reached = [starting_url]
    redirect_link = None
    target_page = self._renderer(starting_url)
    original_status = status = target_page.status
    count = 0

    while status in (301, 302):
      if count > limit:
        # Too many redirections; give up rather than follow a possible loop.
        return None, None, None
      redirect_link = target_page.headers.get('Location')
      target_page = self._renderer(redirect_link)
      status = target_page.status
      pages_reached.append(redirect_link)
      count += 1

    if redirect_link is None:
      # |starting_url| did not redirect at all.
      return None, None, None

    return original_status, redirect_link, pages_reached

  def _CategorizeBrokenLinks(self, url, page, pages):
    '''Find all broken links on a page and create appropriate notes describing
    why they are broken (broken anchor, target redirects, etc). |page| is the
    current page being checked and is the result of rendering |url|. |pages|
    is a callable that takes a path and returns a Page.
    '''
    broken_links = []

    for link in page.links + page.anchor_refs:
      components = urlsplit(link)
      fragment = components.fragment

      if components.path == '':
        # A fragment-only link targets an anchor on the current page.
        if fragment == 'top' or fragment == '':
          continue
        if fragment not in page.anchors:
          broken_links.append((200, url, link, 'target anchor not found'))
      else:
        # Render the target page.
        target_page = pages(components.path)

        if target_page.status != 200:
          if components.path in self._redirection_whitelist:
            continue

          status, relink, _ = self._FollowRedirections(components.path)
          if relink:
            broken_links.append((
                status,
                url,
                link,
                'redirects to %s' % relink))
          else:
            broken_links.append((
                target_page.status, url, link, 'target page not found'))

        elif fragment:
          if fragment not in target_page.anchors:
            broken_links.append((
                target_page.status, url, link, 'target anchor not found'))

    return broken_links

  def GetBrokenLinks(self):
    '''Find all broken links. A broken link is a link that leads to a page
    that does not exist (404s), redirects to another page (301 or 302), or
    has an anchor whose target does not exist.

    Returns a list of tuples of four elements: status, url, target_page,
    notes.
    '''
    broken_links = []

    # Iterate over a snapshot of the keys (.keys() returns a list in Python
    # 2): |self._pages| is a defaultdict, so the lookups performed by
    # _CategorizeBrokenLinks may insert new entries during the loop.
    for url in self._pages.keys():
      page = self._pages[url]
      if page.status != 200:
        continue
      broken_links.extend(self._CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x]))

    return broken_links

  def GetOrphanedPages(self):
    '''Crawls the server to find all pages that are connected to the pages in
    |root_pages|. Returns the links that are valid on the server but are not
    part of the connected component containing the |root_pages|. These pages
    are orphans and cannot be reached simply by clicking through the server.
    '''
    pages_to_check = deque(self._root_pages.union(self._always_detached))
    found = set(self._root_pages) | self._always_detached

    # Breadth-first search of the link graph starting from the roots.
    while pages_to_check:
      item = pages_to_check.popleft()
      target_page = self._pages[item]

      if target_page.status != 200:
        # The page may still be reachable through a redirect; follow it.
        redirected_page = self._FollowRedirections(item)[1]
        if redirected_page is not None:
          target_page = self._pages[redirected_page]

      for link in target_page.links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        url for url, page in self._pages.iteritems() if page.status == 200)

    return [url for url in all_urls - found if url.endswith('.html')]


def StringifyBrokenLinks(broken_links):
  '''Formats |broken_links| (as returned by GetBrokenLinks) into a readable,
  column-aligned, newline-separated string. Links sharing a target are
  grouped, and very large groups are compressed to a single summary line.
  '''
  if not broken_links:
    # Guard: max() below raises ValueError on an empty sequence.
    return ''

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  link_target = itemgetter(2)
  output = []

  def fixed_width(string, width):
    # Pad to |width|; strings longer than |width| are left unchanged.
    return string.ljust(width)

  def pretty_print(link, col_offset=0):
    return "%s -> %s %s" % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  for target, links in groupby(sorted(broken_links, key=link_target),
                               link_target):
    links = list(links)
    # Compress messages when many links share one (non-anchor) target.
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      for link in links:
        output.append(pretty_print(link))

  return '\n'.join(output)