#!/usr/bin/python3
# Copyright 2013-2023 The Khronos Group Inc.
# SPDX-License-Identifier: Apache-2.0

# linkcheck - check internal links of the specified HTML file against
# internal anchors and report inconsistencies.
#
# Usage: linkcheck [-anchors] file.html [file.html ...]

import argparse
from lxml import etree as et


def printSet(s):
    """Print each member of `s` in sorted order, one per line.

    - s - iterable of mutually comparable items (here, sets of strings)
    """
    for key in sorted(s):
        print(' {}'.format(key))


def checkLinks(file, args):
    """Report internal links of an HTML file that have no matching anchor.

    Parses `file` with lxml's HTML parser, collects the target of every
    `href` attribute beginning with '#' and the value of every `id`
    attribute, then prints the link targets for which no `id` exists.
    When `args.anchors` is true, also prints the ids that no internal
    link refers to.

    - file - HTML input, passed straight to et.parse(); the command-line
      driver below passes a filename
    - args - parsed command-line arguments; only `args.anchors` is read
    """
    parser = et.HTMLParser()
    tree = et.parse(file, parser)

    # Remove all <svg> elements, which just add noise to the cross-referencing.
    # findall() returns a list, so removing elements while looping is safe.
    for svg in tree.findall('//svg'):
        svg.getparent().remove(svg)

    # Split href targets into internal ('#name') and external references
    internals = set()
    externals = set()

    for e in tree.findall('//*[@href]'):
        # Do not track '<link>' tags from HTML headers
        if e.tag == 'link':
            continue

        xref = e.get('href')
        if xref.startswith('#'):
            # Internal anchor; store the name without the leading '#'
            internals.add(xref[1:])
        else:
            externals.add(xref)

    # Extract the anchor name from each id, skipping SVG '<g>' tags
    anchors = {e.get('id') for e in tree.findall('//*[@id]') if e.tag != 'g'}

    # Set differences in each direction expose the inconsistencies
    xrefsOnly = internals.difference(anchors)
    anchorsOnly = anchors.difference(internals)

    # Uncomment for additional debugging output:
    # print('External xrefs:', len(externals))
    # printSet(externals)
    #
    # print('Internal xrefs:', len(internals))
    # print('Anchors: ', len(anchors))

    print('Internal xrefs not in anchors:', len(xrefsOnly))
    printSet(xrefsOnly)

    if args.anchors:
        print('Internal anchors not in xrefs:', len(anchorsOnly))
        printSet(anchorsOnly)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('files', metavar='filename', nargs='*',
                        help='an HTML file to check internal links in')
    parser.add_argument('-anchors', action='store_true',
                        help='Report orphaned anchors')

    args = parser.parse_args()

    # Each file is checked independently, with its own report
    for file in args.files:
        checkLinks(file, args)