#!/usr/bin/python3
#
# Copyright 2020-2023 The Khronos Group Inc.
# SPDX-License-Identifier: Apache-2.0

# map_html_anchors - map each id= element in a spec HTML file onto the
# top-level (chapter) id= element it belongs to. Used to rewrite spec
# xrefs for Antora. Writes a Python and/or JavaScript script containing a
# dictionary mapping each discovered ID into the top-level ID it belongs
# to, and the corresponding title element following the id.
#
# This script is very specific to HTML generated by asciidoctor and
# following conventions of the Vulkan style guide.

# Usage: map_html_anchors.py [-pyfile xrefMap.py] [-jsfile xrefMap.js] file.html

import argparse
import re
import sys


def contains_any_of(words, wordlist):
    """Returns True if any element of 'words' is contained in 'wordlist'

       - words - iterable of words to check against wordlist
       - wordlist - iterable of words"""

    return any(word in wordlist for word in words)


# Leading 'Table ' prefix and/or section number ('1.', '1.2.', ...) that
# asciidoctor prepends to captions and section titles
sectNumberPat = re.compile(r'^(Table |)([0-9]+\.)+ *')

# Heading tags whose title is the text of the element itself
_HEADING_TAGS = ('h2', 'h3', 'h4', 'h5', 'h6')


def _find_text(elem, path, default):
    """Return the concatenated text of the first descendant of 'elem'
       matching 'path', or 'default' if there is no such descendant."""

    found = elem.find(path)
    if found is None:
        return default
    return ''.join(found.itertext())


def _div_title(idelem, anchor_id):
    """Return the raw title for a <div id=> element, based on its class.

       - idelem - the <div> Element
       - anchor_id - value of the id attribute of idelem"""

    classes = idelem.get('class')
    if classes is None:
        # <div id=> without a class may have a corresponding <h# id=> with
        # the same id - in this case, the div will be thrown away when the
        # following element is encountered.
        return ''

    divclass = classes.split()

    if contains_any_of(('admonitionblock', 'paragraph', 'sidebarblock'), divclass):
        # <div> classes with no title elements (paragraphs or NOTEs)
        return ''

    if 'listingblock' in divclass:
        # <div class="listingblock"> has title == id (used for API includes)
        return anchor_id

    if contains_any_of(('dlist', 'openblock'), divclass):
        # <div> classes with titles in the text of the first
        # <dt class="hdlist1"> element of the div
        #
        # "dlist" are mostly glossary elements
        # "openblock" are mostly SPIR-V keywords
        #
        # If no <dt> is found, this probably means a label on an API open
        # block, which has no title.
        return _find_text(idelem, './/dt[@class="hdlist1"]', '')

    if contains_any_of(('ulist', 'imageblock'), divclass):
        # <div> classes with titles in the first <div class="title">
        # element of the div, if there is one
        return _find_text(idelem, './/div[@class="title"]', '')

    print(f'Cannot find title for <div id="{anchor_id}" class="{classes}"> - unrecognized class', file=sys.stderr)
    return ''


def _element_title(idelem, anchor_id):
    """Return the raw (un-normalized) title corresponding to an element
       with an id, or '' if it has none.

       - idelem - Element for the ID
       - anchor_id - value of the id attribute of idelem"""

    tag = idelem.tag

    if tag == 'a':
        # <a id=> does not have a corresponding title element
        return ''

    if tag in _HEADING_TAGS:
        # <h# id=> has ((#.)* *title) in the text of its element
        return ''.join(idelem.itertext())

    if tag == 'table':
        # <table id=> may contain a <caption class="title"> with
        # 'Table ##. caption' text
        return _find_text(idelem, './/caption[@class="title"]',
                          'NO TABLE CAPTION FOUND')

    if tag == 'div':
        return _div_title(idelem, anchor_id)

    # Any other tag has no known way to carry a title
    return ''


def add_id(chapelem, idelem, id_map, chapter_id):
    """Add a ID -> [ chapter ID, title] mapping.

       - chapelem - Element for the chapter containing this ID (unused,
         kept for interface compatibility)
       - idelem - Element for the ID itself
       - id_map - dictionary containing the map; updated in place
       - chapter_id - chapter ID of chapelem

       If the ID is already present in id_map, a diagnostic is written to
       stderr and the existing entry is replaced."""

    # The actual ID
    anchor_id = idelem.get('id')

    # Strip whitespace and leading table or section numbers, if present.
    # This is done before the duplicate check so that the diagnostic
    # compares titles in the same (normalized) form.
    id_title = sectNumberPat.sub('', _element_title(idelem, anchor_id).strip())

    if anchor_id in id_map:
        val = id_map[anchor_id]
        print(f'Replacing key {anchor_id} -> ({val[0]}, {val[1]}) with ({chapter_id}, {id_title})', file=sys.stderr)

    # Map the xref to the chapter it came from and its title
    id_map[anchor_id] = [ chapter_id, id_title ]


def _quote(text):
    """Return 'text' as a single-quoted string literal that is valid in
       both Python and JavaScript source."""

    # Backslash must be escaped first so later escapes are not doubled
    escaped = (text.replace('\\', '\\\\')
                   .replace("'", "\\'")
                   .replace('\n', '\\n')
                   .replace('\r', '\\r'))
    return f"'{escaped}'"


def generate_map(id_map, filename, scripttype):
    """Encode the ID map into the specified scripttype ('python' or
       'javascript') in the specified file.

       - id_map - dictionary mapping ID -> [ chapter ID, title ]
       - filename - path of the script to write (overwritten if present)
       - scripttype - 'javascript' selects a JavaScript export; any other
         value produces a Python assignment"""

    # Python and JS are extremely similar when the output is just a
    # dictionary of lists of strings.
    if scripttype == 'javascript':
        header = 'exports.xrefMap = {'
    else:
        header = 'xrefMap = {'

    # The context manager closes the file even if writing fails
    with open(filename, 'w', encoding='utf-8') as fp:
        print(header, file=fp)

        # Sort keys so the output can be compared between runs
        for key in sorted(id_map):
            chapter_id, title = id_map[key][0], id_map[key][1]
            print(f" {_quote(key)} : [ {_quote(chapter_id)}, {_quote(title)} ],", file=fp)

        print('}', file=fp)


def main():
    """Parse the command line, map the IDs found in the HTML spec file
       onto their chapters, and write the requested output scripts."""

    # Imported here rather than at module level so the helper functions
    # above can be imported without lxml being installed.
    from lxml import etree

    parser = argparse.ArgumentParser()

    parser.add_argument('-jsfile', action='store',
                        default=None,
                        help='Specify name of JavaScript file to generate')
    parser.add_argument('-pyfile', action='store',
                        default=None,
                        help='Specify name of Python file to generate')
    parser.add_argument('files', metavar='filename', nargs=1,
                        help='HTML spec file to map IDs from')

    args = parser.parse_args()

    # Tags whose id elements are anchors (we are not concerned about other
    # tags, such as the SVG-related ones in rejected_tags below).
    idtags = ('a', 'div', 'h2', 'h3', 'h4', 'h5', 'h6', 'table')

    # Tags whose id elements we do not care about ('h2' is a special case)
    rejected_tags = ('svg',
                     'circle',
                     'clippath',
                     'defs',
                     'ellipse',
                     'g',
                     'grid',
                     'lineargradient',
                     'marker',
                     'metadata',
                     'namedview',
                     'path',
                     'path-effect',
                     'rect',
                     'stop',
                     'text',
                     'tspan',
                     )

    html_parser = etree.HTMLParser()

    # There is exactly one HTML filename
    filename = args.files[0]
    tree = etree.parse(filename, html_parser)

    # Dictionary mapping an ID (anchor) to [chapter ID, ID title],
    # where 'chapter ID' is the ID of the chapter it appears in
    id_map = {}

    # Find each <div class="sect1"> element, which corresponds to a
    # chapter.
    for chapelem in tree.findall('.//div[@class="sect1"]'):
        h2_elems = chapelem.findall('.//h2[@id]')
        if len(h2_elems) != 1:
            raise UserWarning(f'Error! <div class="sect1"> must have exactly 1 <h2 id=> element, has {len(h2_elems)}')
        chapter_id = h2_elems[0].get('id')

        for idelem in chapelem.findall('.//*[@id]'):
            if idelem.tag in idtags:
                add_id(chapelem, idelem, id_map, chapter_id)
            elif idelem.tag not in rejected_tags:
                # Tags in rejected_tags are silently ignored; anything
                # else is reported so it can be classified.
                print(f' Rejecting unknown tag with ID <{idelem.tag} id="{idelem.get("id")}"', file=sys.stderr)

    if args.pyfile is not None:
        generate_map(id_map, args.pyfile, 'python')
    if args.jsfile is not None:
        generate_map(id_map, args.jsfile, 'javascript')


if __name__ == '__main__':
    main()