• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python3
2#
3# Copyright 2020-2022 The Khronos Group Inc.
4#
5# SPDX-License-Identifier: Apache-2.0
6
7# check_html_xrefs - simple-minded check for internal xrefs in spec HTML
8# that do not exist.
9
# Usage: check_html_xrefs file [file ...]
# Reports each bad xref together with the id and title of the section
# containing it, not the exact location of the link
12
13import argparse
14import re
15import sys
16from lxml import etree
17
# Matches a CSS class beginning with 'sect<N>' and captures N as 'level'
SECTNAME = re.compile(r'sect(?P<level>\d+)')


def find_parent_ids(elem, href):
    """Find the section heading that encloses elem.

    Walks up the ancestors of elem looking for a '<div class="sectM">'
    that has a direct '<hN>' child with N = M + 1, and reports that
    heading. This layout may be specific to the Vulkan spec - the
    hierarchy could be different in other asciidoctor documents.

    elem - node whose enclosing section is wanted
    href - href link text of elem (currently unused; kept so existing
           callers do not need to change)

    Returns a two-element list [ anchor, title ], where anchor is the
    'id' attribute of the heading (None if the heading has no id) and
    title is the concatenated text content of the heading. If no
    enclosing section heading is found, returns
    [ '** NO PARENT NODE IDENTIFIED **', '' ]."""

    parent = elem.getparent()
    while parent is not None:
        if parent.tag == 'div':
            # A <div> with no class attribute yields None here, which
            # re.match() would reject with a TypeError - skip such divs.
            cssclass = parent.get('class')
            matches = SECTNAME.match(cssclass) if cssclass is not None else None
            if matches is not None:
                level = int(matches.group('level'))
                # The heading for a 'sectM' div is its direct <h(M+1)> child
                helem = parent.find('./h{}'.format(level + 1))
                if helem is not None:
                    return [ helem.get('id'), ''.join(helem.itertext()) ]
        # No usable heading at this level; keep climbing
        parent = parent.getparent()
    return [ '** NO PARENT NODE IDENTIFIED **', '' ]
43
if __name__ == '__main__':
    # Command-line driver: for each HTML file named on the command line,
    # report every internal link ('<a href="#...">') whose target id does
    # not exist in that file. Exits with status 1 if any file had bad links.
    argparser = argparse.ArgumentParser()

    argparser.add_argument('files', metavar='filename', nargs='*',
                           help='Path to spec HTML file to check')
    args = argparser.parse_args()

    # Set once any file has a bad link. The exit is deferred until every
    # file has been checked so that a failure in one file does not hide
    # failures in the files after it.
    found_bad_links = False

    for filename in args.files:
        tree = etree.parse(filename, etree.HTMLParser())

        # Collect every 'id' attribute value in the document. Duplicate
        # ids simply collapse in the set.
        ids = {elem.get('id') for elem in tree.findall('.//*[@id]')}

        # Find all internal 'href' attributes and see if they are valid.
        # Keep an (element, href) list so the enclosing section of each
        # bad link can be looked up when reporting.
        bad_refs = []
        for elem in tree.findall('.//a[@href]'):
            href = elem.get('href')
            # Only '#fragment' links are internal. startswith() is used
            # instead of indexing so an empty href does not raise.
            if not href.startswith('#'):
                continue
            target = href[1:]
            if target not in ids:
                bad_refs.append((elem, target))

        # Report hrefs not found in ids
        if bad_refs:
            found_bad_links = True
            print('Found bad links in {}:'.format(filename))
            for (elem, target) in bad_refs:
                anchor, title = find_parent_ids(elem, target)
                # anchor may be None (heading without an id); str() keeps
                # the width format spec from raising on None
                print('{:<40} in {:<28} ({})'.format(target, str(anchor), title))

    if found_bad_links:
        sys.exit(1)
97