#!/usr/bin/python3
#
# Copyright 2020-2022 The Khronos Group Inc.
#
# SPDX-License-Identifier: Apache-2.0

# check_html_xrefs - simple-minded check for internal xrefs in spec HTML
# that do not exist.

# Usage: check_html_xrefs file
# Just reports bad xrefs, not where they occur

import argparse
import re
import sys
from lxml import etree

# Matches the CSS class of a section <div> ('sect1', 'sect2', ...) and
# captures the numeric nesting level.
SECTNAME = re.compile(r'sect(?P<level>\d+)')


def find_parent_ids(elem, href):
    """Find the section heading enclosing an element.

    Walks up the ancestors of elem looking for a '<div class="sectM"' tag
    that has a '<hN' child with N = M + 1, and reports that heading. This
    may be specific to the Vulkan spec, though - hierarchy could be
    different in other asciidoctor documents.

    elem - this node
    href - href link text of elem (not used by the lookup; kept so
           existing callers continue to work)

    Returns a two-element list [ anchor, title ], where anchor is the
    heading's 'id' attribute (None if the heading has no id) and title is
    the heading's concatenated text. If no enclosing section heading is
    found, returns [ '** NO PARENT NODE IDENTIFIED **', '' ]."""

    # Find parent <div> with class="sect#"
    parent = elem.getparent()
    while parent is not None:
        if parent.tag == 'div':
            # A <div> with no class attribute yields None, which
            # SECTNAME.match() would reject with a TypeError; treat it as
            # an empty class string so it simply fails to match.
            cssclass = parent.get('class') or ''
            matches = SECTNAME.match(cssclass)
            if matches is not None:
                level = int(matches.group('level'))
                # Look for corresponding header tag in this div
                helem = parent.find('./h{}'.format(level + 1))
                if helem is not None:
                    return [ helem.get('id'), ''.join(helem.itertext()) ]
        # Not a section div, or no matching header: keep climbing
        parent = parent.getparent()
    return [ '** NO PARENT NODE IDENTIFIED **', '' ]


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('files', metavar='filename', nargs='*',
                        help='Path to spec HTML file to check')
    args = parser.parse_args()

    for filename in args.files:
        html_parser = etree.HTMLParser()
        tree = etree.parse(filename, html_parser)

        # Collect the value of every 'id' attribute in the document.
        # Duplicate ids are not reported; they collapse into the set.
        ids = {elem.get('id') for elem in tree.findall('.//*[@id]')}

        # Find all internal 'href' attributes and see if they are valid.
        # Keep an (element, href) list so the enclosing section of each
        # bad link can be reported; every occurrence is kept, even when
        # the same target is referenced more than once.
        refs = []
        for elem in tree.findall('.//a[@href]'):
            href = elem.get('href')
            # If not a local href, skip it. startswith() also copes with
            # an empty href="", where indexing href[0] would raise.
            if not href.startswith('#'):
                continue
            target = href[1:]
            # If there is a corresponding id, the link is fine
            if target not in ids:
                refs.append((elem, target))

        # Report hrefs not found in ids
        if refs:
            print('Found bad links in {}:'.format(filename))
            for (elem, href) in refs:
                anchor, title = find_parent_ids(elem, href)
                # A heading without an id comes back as None, which
                # cannot be formatted with a width specifier.
                if anchor is None:
                    anchor = '** NO ID **'
                print('{:<40} in {:<28} ({})'.format(href, anchor, title))
            # Stop with a failure status at the first file that has bad
            # links; any remaining files are not checked.
            sys.exit(1)