#!/usr/bin/python3
#
# Copyright 2020-2021 The Khronos Group Inc.
#
# SPDX-License-Identifier: Apache-2.0

# check_html_xrefs - simple-minded check for internal xrefs in spec HTML
# that don't exist.

# Usage: check_html_xrefs file [file...]
# Reports bad xrefs and their enclosing section titles, but not exact locations
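# Example invocation (the HTML path below is only illustrative, not a path
# defined by this script):
#   python3 check_html_xrefs.py out/html/vkspec.html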

import argparse
import re
from lxml import etree

# Matches the 'sectN' class asciidoctor puts on section <div> wrappers
SECTNAME = re.compile(r'sect(?P<level>\d+)')

def find_parent_ids(elem, href):
    """Find section titles in parents, which are the 'id' attributes of '<hN>'
       children of '<div class="sectM">' tags, where N = M + 1. This may be
       specific to the Vulkan spec, though - the hierarchy could be different in
       other asciidoctor documents. Returns a list of [ anchor, title ].

       elem - this node
       href - href attribute text of elem"""
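    # Illustrative sketch of the structure being searched (the id and title
    # values below are made up, not taken from the spec):
    #
    #   <div class="sect2">
    #     <h3 id="some-section">Some Section Title</h3>
    #     ... <a href="#broken-anchor"> ... </a> ...
    #   </div>
    #
    # For the <a> element above, this returns ['some-section', 'Some Section Title'].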

    # Find parent <div> with class="sect#"
    parent = elem.getparent()
    while parent is not None:
        if parent.tag == 'div':
            # Not every <div> has a class attribute, so guard against None
            cssclass = parent.get('class') or ''
            matches = SECTNAME.match(cssclass)
            if matches is not None:
                level = int(matches.group('level'))
                # Look for corresponding header tag in this div
                helem = parent.find('./h{}'.format(level+1))
                if helem is not None:
                    return [ helem.get('id'), ''.join(helem.itertext()) ]
        parent = parent.getparent()
    return [ '** NO PARENT NODE IDENTIFIED **', '' ]

if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('files', metavar='filename', nargs='*',
                        help='Path to spec HTML file to check')
    args = parser.parse_args()

    for filename in args.files:
        # Use a separate name so the argparse parser is not shadowed
        htmlparser = etree.HTMLParser()
        tree = etree.parse(filename, htmlparser)

        # Find all elements with an 'id' attribute
        id_elems = tree.findall('.//*[@id]')
        ids = set()
        for elem in id_elems:
            id_attr = elem.get('id')
            if id_attr in ids:
                # Duplicate IDs are not currently reported
                # print('Duplicate ID attribute:', id_attr)
                pass
            else:
                ids.add(id_attr)

        # Find all internal 'href' attributes and see if they're valid
        # Keep an (element, href) list for tracking parents
        # Also keep a count of each href
        ref_elems = tree.findall('.//a[@href]')
        refs = []
        count = {}
        for elem in ref_elems:
            href = elem.get('href')
            # Only check local hrefs; skip external links
            if href.startswith('#'):
                # If there's a corresponding id, the xref is valid and is skipped
                href = href[1:]
                if href not in ids:
                    refs.append((elem, href))
                    count[href] = count.get(href, 0) + 1
            else:
                # Skip external href
                # print('Skipping external href:', href)
                pass

        # Report hrefs that have no matching id, with their enclosing section
        print('Bad links in {}:'.format(filename))
        for (elem, href) in refs:
            parents = find_parent_ids(elem, href)
            print('{:<40} in {:<28} ({})'.format(href, parents[0], parents[1]))