# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import defaultdict, deque, namedtuple
from HTMLParser import HTMLParser, HTMLParseError
from itertools import groupby
from operator import itemgetter
import posixpath
from urlparse import urlsplit

from file_system_util import CreateURLsFromPaths


Page = namedtuple('Page', 'status, links, anchors, anchor_refs')


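# Illustrative behavior of _SplitAnchor (follows from urlsplit semantics; the
# example inputs are made up):
#   _SplitAnchor('tabs.html#method-create') -> ('tabs.html', 'method-create')
#   _SplitAnchor('#top') -> ('', 'top')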
def _SplitAnchor(url):
  components = urlsplit(url)
  return components.path, components.fragment


def _Process(path, renderer):
  '''Render the page at |path| using |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the http status code
  of the page render, the hrefs of all links that occur on the page, all of
  the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-html page is rendered successfully, a |Page| with status code 200
  and all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  if '/' in path:
    base, _ = path.rsplit('/', 1)
  else:
    base = ''
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link[1:]
      else:
        link = posixpath.normpath('%s/%s' % (base, link))

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.append(link)

  return Page(200, edges, anchors, anchor_refs)
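
# For instance (illustrative, with made-up page content): processing
# 'extensions/tabs.html' whose body contains
# <a href="../apps/about_apps.html"> and
# <a href="experimental.history.html#anchor"> yields
# edges=['apps/about_apps.html'] and
# anchor_refs=['extensions/experimental_history.html#anchor'].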


class _ContentParser(HTMLParser):
  '''Parse an html file, pulling out all links and anchor_refs, where an
  anchor_ref is a link that contains an anchor.
  '''

  def __init__(self):
    HTMLParser.__init__(self)
    self.links = []
    self.anchors = set()

  def handle_starttag(self, tag, raw_attrs):
    attrs = dict(raw_attrs)

    if tag == 'a':
      # Strip whitespace from hrefs; ignore hrefs that are just a '.'
      # (period), contain python templating code, are absolute urls, point at
      # zip files, or execute javascript on the page.
      href = attrs.get('href', '').strip()
      if href and href != '.' and '{{' not in href:
        if urlsplit(href).scheme not in ('http', 'https'):
          if not href.endswith('.zip') and 'javascript:' not in href:
            self.links.append(href)

    if attrs.get('id'):
      self.anchors.add(attrs['id'])
    if attrs.get('name'):
      self.anchors.add(attrs['name'])
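
# A minimal usage sketch (illustrative; the html snippet is made up):
#   parser = _ContentParser()
#   parser.feed('<a href="tabs.html#create">x</a><h2 id="overview">y</h2>')
#   parser.links   == ['tabs.html#create']
#   parser.anchors == set(['overview'])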


class LinkErrorDetector(object):
  '''Finds link errors on the doc server. These include links whose target
  page 404s, links that contain an anchor that doesn't exist on the target
  page, and pages that have no links to them.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes
    for the orphaned page search.
    '''
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
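    # Any path that is never rendered below defaults to a 404 Page.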
    self._root_pages = frozenset(root_pages)
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory, rendering each URL and
    processing the resultant html to pull out all links and anchors.
    '''
    top_level_directories = (
      ('docs/templates/public', ''),
      ('docs/static', 'static/'),
      ('docs/examples', 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, path in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          print('%s, a url derived from the path %s, resulted in a %s' % (
              url, dirpath, self._pages[url].status))

  def _FollowRedirections(self, starting_url, limit=4):
    '''Follow redirections until a non-redirecting page is reached. Start at
    |starting_url|, which must return a 301 or 302 status code.

    Return a tuple of: the status of rendering |starting_url|, the final url,
    and a list of the pages reached, including |starting_url|. If no
    redirection occurred, returns (None, None, None).
    '''
    pages_reached = [starting_url]
    redirect_link = None
    target_page = self._renderer(starting_url)
    original_status = status = target_page.status
    count = 0

    while status in (301, 302):
      if count > limit:
        return None, None, None
      redirect_link = target_page.headers.get('Location')
      target_page = self._renderer(redirect_link)
      status = target_page.status
      pages_reached.append(redirect_link)
      count += 1

    if redirect_link is None:
      return None, None, None

    return original_status, redirect_link, pages_reached
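
  # For instance (illustrative): if rendering 'extensions/' returns a 302 with
  # a Location header of 'extensions/index.html', and that page renders with
  # status 200, then _FollowRedirections('extensions/') returns
  # (302, 'extensions/index.html', ['extensions/', 'extensions/index.html']).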

  def _CategorizeBrokenLinks(self, url, page, pages):
    '''Find all broken links on a page and create appropriate notes describing
    why they are broken (broken anchor, target redirects, etc). |page| is the
    current page being checked and is the result of rendering |url|. |pages|
    is a callable that takes a path and returns a Page.
    '''
    broken_links = []

    for link in page.links + page.anchor_refs:
      components = urlsplit(link)
      fragment = components.fragment

      if components.path == '':
        if fragment == 'top' or fragment == '':
          continue
        if fragment not in page.anchors:
          broken_links.append((200, url, link, 'target anchor not found'))
      else:
        # Render the target page.
        target_page = pages(components.path)

        if target_page.status != 200:
          if components.path in self._redirection_whitelist:
            continue

          status, relink, _ = self._FollowRedirections(components.path)
          if relink:
            broken_links.append((
                status,
                url,
                link,
                'redirects to %s' % relink))
          else:
            broken_links.append((
                target_page.status, url, link, 'target page not found'))

        elif fragment:
          if fragment not in target_page.anchors:
            broken_links.append((
                target_page.status, url, link, 'target anchor not found'))

    return broken_links
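
  # Each entry of the returned list is a (status, url, link, note) tuple,
  # e.g. (illustrative): (404, 'extensions/tabs.html', 'extensions/tabz.html',
  # 'target page not found').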

  def GetBrokenLinks(self):
    '''Find all broken links. A broken link is a link that leads to a page
    that does not exist (404s), redirects to another page (301 or 302), or
    has an anchor whose target does not exist.

    Returns a list of tuples of four elements: status, url, target_page,
    notes.
    '''
    broken_links = []

    for url in self._pages.keys():
      page = self._pages[url]
      if page.status != 200:
        continue
      broken_links.extend(self._CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x]))

    return broken_links

  def GetOrphanedPages(self):
    '''Crawls the server to find all pages that are connected to the
    |root_pages|. Returns the links that are valid on the server but are not
    part of the connected component containing the |root_pages|. These pages
    are orphans and cannot be reached simply by clicking through the server.
    '''
    pages_to_check = deque(self._root_pages.union(self._always_detached))
    found = set(self._root_pages) | self._always_detached

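    # Breadth-first search over the link graph, following a redirection when
    # a page does not render directly.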
    while pages_to_check:
      item = pages_to_check.popleft()
      target_page = self._pages[item]

      if target_page.status != 200:
        redirected_page = self._FollowRedirections(item)[1]
        if redirected_page is not None:
          target_page = self._pages[redirected_page]

      for link in target_page.links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        url for url, page in self._pages.iteritems() if page.status == 200)

    return [url for url in all_urls - found if url.endswith('.html')]


def StringifyBrokenLinks(broken_links):
  '''Returns a string that lists |broken_links| in a readable format.
  '''
  def fixed_width(string, width):
    return string.ljust(width)

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  by_target = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    return "%s -> %s %s" % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  for target, links in groupby(sorted(broken_links, key=by_target), by_target):
    links = list(links)
    # Compress runs of more than 50 links that share a target into a single
    # message.
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      for link in links:
        output.append(pretty_print(link))

  return '\n'.join(output)
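
# A minimal usage sketch (illustrative; the file system, renderer, and root
# page values here are assumptions, not part of this module):
#   detector = LinkErrorDetector(
#       file_system, renderer, 'docs/templates/public',
#       ('extensions/index.html', 'apps/about_apps.html'))
#   print StringifyBrokenLinks(detector.GetBrokenLinks())
#   print '\n'.join(detector.GetOrphanedPages())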