#!/usr/bin/python3
#
# Copyright 2020-2021 The Khronos Group Inc.
#
# SPDX-License-Identifier: Apache-2.0

# check_html_xrefs - simple-minded check for internal xrefs in spec HTML
# that don't exist.

# Usage: check_html_xrefs file [file ...]
# Reports each bad xref together with the anchor and title of the section
# that contains it.

import argparse
import re
from lxml import etree

# Matches the CSS class of a section <div>, e.g. 'sect2', capturing the
# numeric nesting level.
SECTNAME = re.compile(r'sect(?P<level>\d+)')


def find_parent_ids(elem, href):
    """Find section titles in parents, which are the 'id' elements of '<hN'
    children of '<div class="sectM"' tags, and N = M + 1. This may be
    specific to the Vulkan spec, though - hierarchy could be different in
    other asciidoctor documents. Returns a list of [ anchor, title ].

    The anchor is None if the header that was found has no 'id' attribute.
    If no enclosing section with a header is found, a placeholder anchor
    and an empty title are returned.

    elem - this node
    href - href link text of elem (currently unused)"""

    # Walk up the tree looking for a parent <div> with class="sect#"
    parent = elem.getparent()
    while parent is not None:
        if parent.tag == 'div':
            # A <div> without a class attribute yields None from get();
            # default to '' so the regular expression match cannot raise.
            matches = SECTNAME.match(parent.get('class', ''))
            if matches is not None:
                level = int(matches.group('level'))
                # Look for corresponding header tag in this div
                helem = parent.find('./h{}'.format(level + 1))
                if helem is not None:
                    return [helem.get('id'), ''.join(helem.itertext())]
        parent = parent.getparent()
    return ['** NO PARENT NODE IDENTIFIED **', '']


def _collect_ids(tree):
    """Return the set of all 'id' attribute values in the parsed document.

    Duplicate ids are silently collapsed."""
    return {elem.get('id') for elem in tree.findall('.//*[@id]')}


def _find_bad_refs(tree, ids):
    """Return a list of (element, target) tuples, in document order, for
    every <a href="#target"> whose target is not in ids.

    There is one entry per occurrence, so a target that is referenced
    several times appears several times. Hrefs that do not start with '#'
    (external links, and empty hrefs) are skipped."""
    refs = []
    for elem in tree.findall('.//a[@href]'):
        href = elem.get('href')
        # startswith() rather than href[0], so an empty href is treated
        # as non-local instead of raising IndexError.
        if not href.startswith('#'):
            continue
        target = href[1:]
        if target not in ids:
            refs.append((elem, target))
    return refs


def main():
    """Parse the command line and report bad internal links in each file."""
    parser = argparse.ArgumentParser()

    parser.add_argument('files', metavar='filename', nargs='*',
                        help='Path to spec HTML file to check')
    args = parser.parse_args()

    for filename in args.files:
        tree = etree.parse(filename, etree.HTMLParser())

        # Find all internal hrefs with no corresponding 'id' in the document
        refs = _find_bad_refs(tree, _collect_ids(tree))

        print('Bad links in {}:'.format(filename))
        for (elem, href) in refs:
            parents = find_parent_ids(elem, href)
            # '!s' forces str() on the anchor, which is None when the
            # section header has no id; None does not accept '<28'.
            print('{:<40} in {!s:<28} ({})'.format(href, parents[0], parents[1]))


if __name__ == '__main__':
    main()