#!/usr/bin/python3
#
# Copyright 2020-2021 The Khronos Group Inc.
#
# SPDX-License-Identifier: Apache-2.0

# check_html_xrefs - simple-minded check for internal xrefs in spec HTML
# that don't exist.

# Usage: check_html_xrefs file [file...]
# Reports bad xrefs and their enclosing section titles, but not exact locations
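# Example invocation (the HTML path below is only illustrative, not a path
# defined by this script):
#   python3 check_html_xrefs.py out/html/vkspec.html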

import argparse
import re
from lxml import etree

# Matches the 'sectN' class asciidoctor puts on section <div> wrappers
SECTNAME = re.compile(r'sect(?P<level>\d+)')

def find_parent_ids(elem, href):
    """Find section titles in parents, which are the 'id' attributes of '<hN>'
       children of '<div class="sectM">' tags, where N = M + 1. This may be
       specific to the Vulkan spec, though - the hierarchy could be different in
       other asciidoctor documents. Returns a list of [ anchor, title ].

       elem - this node
       href - href attribute text of elem"""
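    # Illustrative sketch of the structure being searched (the id and title
    # values below are made up, not taken from the spec):
    #
    #   <div class="sect2">
    #     <h3 id="some-section">Some Section Title</h3>
    #     ... <a href="#broken-anchor"> ... </a> ...
    #   </div>
    #
    # For the <a> element above, this returns ['some-section', 'Some Section Title'].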

    # Find parent <div> with class="sect#"
    parent = elem.getparent()
    while parent is not None:
        if parent.tag == 'div':
            # Not every <div> has a class attribute, so guard against None
            cssclass = parent.get('class') or ''
            matches = SECTNAME.match(cssclass)
            if matches is not None:
                level = int(matches.group('level'))
                # Look for corresponding header tag in this div
                helem = parent.find('./h{}'.format(level+1))
                if helem is not None:
                    return [ helem.get('id'), ''.join(helem.itertext()) ]
        parent = parent.getparent()
    return [ '** NO PARENT NODE IDENTIFIED **', '' ]

if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('files', metavar='filename', nargs='*',
                        help='Path to spec HTML file to check')
    args = parser.parse_args()

    for filename in args.files:
        # Use a separate name so the argparse parser is not shadowed
        htmlparser = etree.HTMLParser()
        tree = etree.parse(filename, htmlparser)

        # Find all elements with an 'id' attribute
        id_elems = tree.findall('.//*[@id]')
        ids = set()
        for elem in id_elems:
            id_attr = elem.get('id')
            if id_attr in ids:
                # Duplicate IDs are not currently reported
                # print('Duplicate ID attribute:', id_attr)
                pass
            else:
                ids.add(id_attr)

        # Find all internal 'href' attributes and see if they're valid
        # Keep an (element, href) list for tracking parents
        # Also keep a count of each href
        ref_elems = tree.findall('.//a[@href]')
        refs = []
        count = {}
        for elem in ref_elems:
            href = elem.get('href')
            # Only check local hrefs; skip external links
            if href.startswith('#'):
                # If there's a corresponding id, the xref is valid and is skipped
                href = href[1:]
                if href not in ids:
                    refs.append((elem, href))
                    count[href] = count.get(href, 0) + 1
            else:
                # Skip external href
                # print('Skipping external href:', href)
                pass

        # Report hrefs that have no matching id, with their enclosing section
        print('Bad links in {}:'.format(filename))
        for (elem, href) in refs:
            parents = find_parent_ids(elem, href)
            print('{:<40} in {:<28} ({})'.format(href, parents[0], parents[1]))