# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import defaultdict, deque, namedtuple
from HTMLParser import HTMLParser, HTMLParseError
from itertools import groupby
from operator import itemgetter
import posixpath
from urlparse import urlsplit

from file_system_util import CreateURLsFromPaths
from path_util import AssertIsDirectory


# The processed result of rendering a single page:
#   status      - http status code returned when rendering the page
#   links       - hrefs of all non-anchor links found on the page
#   anchors     - all anchors defined on the page (id and name attributes)
#   anchor_refs - all links that contain an anchor component
Page = namedtuple('Page', 'status, links, anchors, anchor_refs')


def _SplitAnchor(url):
  '''Splits |url| into its path and fragment (anchor) components.'''
  components = urlsplit(url)
  return components.path, components.fragment


def _Process(path, renderer):
  '''Render the page at |path| using a |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the http status code
  of the page render, the href of all the links that occurred on the page, all
  of the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-html page is properly rendered, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    # Malformed html is not this detector's concern; treat it as a page
    # with no links or anchors.
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  if '/' in path:
    base, _ = path.rsplit('/', 1)
  else:
    base = ''
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      # Replace every '.' except the one before the extension with '_'.
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link[1:]
      else:
        link = posixpath.normpath('%s/%s' % (base, link))

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.append(link)

  return Page(200, edges, anchors, anchor_refs)


class _ContentParser(HTMLParser):
  '''Parse an html file pulling out all links and anchor_refs, where an
  anchor_ref is a link that contains an anchor.
  '''

  def __init__(self):
    HTMLParser.__init__(self)
    self.links = []
    self.anchors = set()

  def handle_starttag(self, tag, raw_attrs):
    attrs = dict(raw_attrs)

    if tag == 'a':
      # Handle special cases for href's that: start with a space, contain
      # just a '.' (period), contain python templating code, are an absolute
      # url, are a zip file, or execute javascript on the page.
      href = attrs.get('href', '').strip()
      if href and href != '.' and '{{' not in href:
        if urlsplit(href).scheme not in ('http', 'https'):
          if not href.endswith('.zip') and 'javascript:' not in href:
            self.links.append(href)

    # Both id and name attributes define anchors that links may target.
    if attrs.get('id'):
      self.anchors.add(attrs['id'])
    if attrs.get('name'):
      self.anchors.add(attrs['name'])


class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links, those with
  a target page that 404s or contain an anchor that doesn't exist, or pages that
  have no links to them.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes for
    the orphaned page search.
    '''
    AssertIsDirectory(public_path)
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    # Any URL not rendered by _RenderAllPages defaults to a 404 Page.
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    # Pages that are intentionally unlinked; treated as extra roots in the
    # orphan search so they are never reported.
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
    # Paths that are expected to redirect; not reported as broken.
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory rendering each URL and
    processing the resultant html to pull out all links and anchors.
    '''
    top_level_directories = (
      ('docs/templates/public/', ''),
      ('docs/static/', 'static/'),
      ('docs/examples/', 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, _ in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          # Format explicitly; under Python 2 print(a, b, c) would print a
          # tuple repr rather than a readable message.
          print('%s, a url derived from the path %s, resulted in a %s' % (
              url, dirpath, self._pages[url].status))

  def _FollowRedirections(self, starting_url, limit=4):
    '''Follow redirection until a non-redirectable page is reached. Start at
    |starting_url| which must return a 301 or 302 status code.

    Return a tuple of: the status of rendering |starting_url|, the final url,
    and a list of the pages reached including |starting_url|. If no redirection
    occurred, returns (None, None, None).
    '''
    pages_reached = [starting_url]
    redirect_link = None
    target_page = self._renderer(starting_url)
    original_status = status = target_page.status
    count = 0

    while status in (301, 302):
      if count > limit:
        # Too many hops; likely a redirect loop.
        return None, None, None
      redirect_link = target_page.headers.get('Location')
      target_page = self._renderer(redirect_link)
      status = target_page.status
      pages_reached.append(redirect_link)
      count += 1

    if redirect_link is None:
      return None, None, None

    return original_status, redirect_link, pages_reached

  def _CategorizeBrokenLinks(self, url, page, pages):
    '''Find all broken links on a page and create appropriate notes describing
    why they are broken (broken anchor, target redirects, etc). |page| is the
    current page being checked and is the result of rendering |url|. |pages|
    is a callable that takes a path and returns a Page.

    Returns a list of (status, url, link, note) tuples.
    '''
    broken_links = []

    for link in page.links + page.anchor_refs:
      components = urlsplit(link)
      fragment = components.fragment

      if components.path == '':
        # A same-page anchor reference like '#foo'.
        if fragment == 'top' or fragment == '':
          continue
        if fragment not in page.anchors:
          broken_links.append((200, url, link, 'target anchor not found'))
      else:
        # Render the target page
        target_page = pages(components.path)

        if target_page.status != 200:
          if components.path in self._redirection_whitelist:
            continue

          status, relink, _ = self._FollowRedirections(components.path)
          if relink:
            broken_links.append((
                status,
                url,
                link,
                'redirects to %s' % relink))
          else:
            broken_links.append((
                target_page.status, url, link, 'target page not found'))

        elif fragment:
          if fragment not in target_page.anchors:
            broken_links.append((
                target_page.status, url, link, 'target anchor not found'))

    return broken_links

  def GetBrokenLinks(self):
    '''Find all broken links. A broken link is a link that leads to a page
    that does not exist (404s), redirects to another page (301 or 302), or
    has an anchor whose target does not exist.

    Returns a list of tuples of four elements: status, url, target_page,
    notes.
    '''
    broken_links = []

    for url, page in self._pages.items():
      if page.status != 200:
        continue
      broken_links.extend(self._CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x]))

    return broken_links

  def GetOrphanedPages(self):
    '''Crawls the server find all pages that are connected to the pages at
    |seed_url|s. Return the links that are valid on the server but are not in
    part of the connected component containing the |root_pages|. These pages
    are orphans and cannot be reached simply by clicking through the server.
    '''
    # Breadth-first search starting from the roots and the always-detached
    # pages; anything not reached is an orphan.
    pages_to_check = deque(self._root_pages.union(self._always_detached))
    found = set(self._root_pages) | self._always_detached

    while pages_to_check:
      item = pages_to_check.popleft()
      target_page = self._pages[item]

      if target_page.status != 200:
        # The page may live behind a redirect; follow it before giving up.
        redirected_page = self._FollowRedirections(item)[1]
        if redirected_page is not None:
          target_page = self._pages[redirected_page]

      for link in target_page.links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        [url for url, page in self._pages.iteritems() if page.status == 200])

    return [url for url in all_urls - found if url.endswith('.html')]


def StringifyBrokenLinks(broken_links):
  '''Returns a readable multi-line string describing |broken_links|, a list
  of (status, url, link, note) tuples as produced by GetBrokenLinks.
  '''
  # Guard: max() below raises ValueError on an empty sequence.
  if not broken_links:
    return ''

  def fixed_width(string, width):
    return "%s%s" % (string, (width - len(string)) * ' ')

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  link_target = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    return "%s -> %s %s" % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  for target, links in groupby(sorted(broken_links, key=link_target),
                               link_target):
    links = list(links)
    # Compress messages
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      for link in links:
        output.append(pretty_print(link))

  return '\n'.join(output)