#!/usr/bin/python3
#
# Copyright 2020-2023 The Khronos Group Inc.
# SPDX-License-Identifier: Apache-2.0

# map_html_anchors - map each id= element in a spec HTML file onto the
# top-level (chapter) id= element it belongs to. Used to rewrite spec
# xrefs for Antora. Writes a Python and/or JavaScript file containing a
# dictionary mapping each discovered ID to the top-level ID it belongs
# to, and the corresponding title element following the id.
#
# This script is very specific to HTML generated by asciidoctor and
# following conventions of the Vulkan style guide.

# Usage: map_html_anchors.py [-pyfile xrefMap.py] [-jsfile xrefMap.js] file.html

import argparse
import re
import sys
from lxml import etree

def contains_any_of(words, wordlist):
    """Returns True if any element of 'words' is contained in 'wordlist',
       and False otherwise (including when 'words' is empty).

       - words - iterable of candidate words to look up
       - wordlist - container of words to test membership against"""

    return any(candidate in wordlist for candidate in words)

# Matches, at the start of a title, an optional 'Table ' prefix followed by
# one or more 'N.' number components and any trailing spaces.
sectNumberPat = re.compile(r'^(Table |)([0-9]+\.)+ *')

def _descendant_text(elem, path, default):
    """Return the concatenated text of the first descendant of 'elem'
       matching 'path', or 'default' if there is no such descendant.

       - elem - Element to search below
       - path - ElementPath expression passed to elem.find()
       - default - value returned when nothing matches"""

    found = elem.find(path)
    if found is None:
        return default
    return ''.join(found.itertext())

def _div_title(idelem, elem_id):
    """Return the (unstripped) title for a <div id=> element, chosen by the
       classes in its class= attribute. Returns '' when no title applies.

       - idelem - the <div> Element
       - elem_id - value of the id= attribute of idelem"""

    classes = idelem.get('class')
    if classes is None:
        # <div id=> without a class may have a corresponding <h# id=> with the
        # same id - in this case, the div will be thrown away when the
        # following element is encountered.
        return ''

    divclass = classes.split()

    if contains_any_of(('admonitionblock', 'paragraph', 'sidebarblock'), divclass):
        # <div> classes with no title elements (paragraphs or NOTEs)
        return ''

    if 'listingblock' in divclass:
        # <div id= class="listingblock"> has title == id (used for API includes)
        return elem_id

    if contains_any_of(('dlist', 'openblock'), divclass):
        # <div> classes with titles in the text of the first
        # <dt class="hdlist1"> element of the div
        #
        # "dlist" are mostly glossary elements
        # "openblock" are mostly SPIR-V keywords
        #
        # If no such <dt> is found, this probably means a label on an
        # API open block, which has no title.
        return _descendant_text(idelem, './/dt[@class="hdlist1"]', '')

    if contains_any_of(('ulist', 'imageblock'), divclass):
        # <div> classes with titles in the first
        # <div class="title"> element of the div, if there is one
        return _descendant_text(idelem, './/div[@class="title"]', '')

    print(f'Cannot find title for <div id="{elem_id}" class="{classes}"> - unrecognized class', file=sys.stderr)
    return ''

def add_id(chapelem, idelem, id_map, chapter_id):
    """Add a ID -> [ chapter ID, title] mapping.

       The title is looked up according to the tag of idelem, then stripped
       of surrounding whitespace and of any leading table or section
       number. If the ID is already in id_map, a diagnostic is written to
       stderr and the existing entry is overwritten.

       - chapelem - Element for the chapter containing this ID (not used
         by this function)
       - idelem - Element for the ID itself
       - id_map - dictionary containing the map; modified in place
       - chapter_id - chapter ID of chapelem"""

    # The actual ID
    elem_id = idelem.get('id')
    tag = idelem.tag

    # Try to determine the title corresponding to this ID, or '' otherwise
    if tag == 'a':
        # <a id=> does not have a corresponding title element
        id_title = ''
    elif tag in ('h2', 'h3', 'h4', 'h5', 'h6'):
        # <h# id=> has ((#.)* *title) in the text of its element
        id_title = ''.join(idelem.itertext())
    elif tag == 'table':
        # <table id=> may be followed by <caption class="title">
        # with 'Table ##. caption' text
        id_title = _descendant_text(idelem, './/caption[@class="title"]',
                                    'NO TABLE CAPTION FOUND')
    elif tag == 'div':
        id_title = _div_title(idelem, elem_id)
    else:
        # Any other tag has no known title convention. Without this branch
        # id_title would be unbound below and raise UnboundLocalError.
        id_title = ''
        print(f'Cannot find title for <{tag} id="{elem_id}"> - unrecognized tag', file=sys.stderr)

    if elem_id in id_map:
        val = id_map[elem_id]
        print(f'Replacing key {elem_id} -> ({val[0]}, {val[1]}) with ({chapter_id}, {id_title})', file=sys.stderr)

    # Strip whitespace and leading table or section numbers, if present
    id_title = sectNumberPat.sub('', id_title.strip())

    # Map the xref to the chapter it came from and its title
    id_map[elem_id] = [ chapter_id, id_title ]

def _quote_literal(text):
    """Return 'text' as a single-quoted string literal that is valid in
       both Python and JavaScript source.

       Backslashes, single quotes, and line breaks are escaped; text
       containing none of these is emitted unchanged between the quotes."""

    escaped = (str(text)
               .replace('\\', '\\\\')   # must be first, so later escapes are not doubled
               .replace("'", "\\'")
               .replace('\n', '\\n')
               .replace('\r', '\\r'))
    return f"'{escaped}'"

def generate_map(id_map, filename, scripttype):
    """Encode the ID map into the specified scripttype ('python' or
       'javascript') in the specified file.

       - id_map - dictionary mapping ID -> [ chapter ID, title ]
       - filename - path of the file to write (overwritten if it exists)
       - scripttype - 'javascript' writes an 'exports.xrefMap' assignment;
         any other value writes a Python 'xrefMap' assignment"""

    # Python and JS are extremely similar when the output is just a
    # dictionary of lists of strings.
    if scripttype == 'javascript':
        header = 'exports.xrefMap = {'
    else:
        header = 'xrefMap = {'

    # 'with' closes the file even if a write fails. The encoding is fixed
    # so the output does not depend on the locale of the build machine.
    with open(filename, 'w', encoding='utf-8') as fp:
        print(header, file=fp)

        # Sort keys so they can be compared between runs
        for key in sorted(id_map):
            entry = id_map[key]
            print(f"    {_quote_literal(key)} : [ {_quote_literal(entry[0])}, {_quote_literal(entry[1])} ],", file=fp)

        print('}', file=fp)

if __name__ == '__main__':
    # Command line: optional output file names, then the HTML file to scan
    argparser = argparse.ArgumentParser()
    argparser.add_argument('-jsfile', action='store',
                           default=None,
                           help='Specify name of JavaScript file to generate')
    argparser.add_argument('-pyfile', action='store',
                           default=None,
                           help='Specify name of Python file to generate')
    argparser.add_argument('files', metavar='filename', nargs=1,
                           help='HTML spec file to map IDs from')
    options = argparser.parse_args()

    # Tags whose id= attributes are anchors to be recorded in the map
    anchor_tags = ('a', 'div', 'h2', 'h3', 'h4', 'h5', 'h6', 'table')

    # Tags whose id= attributes are known and silently skipped
    # (<svg> and the elements found inside it)
    ignored_tags = (
        'svg',
        'circle',
        'clippath',
        'defs',
        'ellipse',
        'g',
        'grid',
        'lineargradient',
        'marker',
        'metadata',
        'namedview',
        'path',
        'path-effect',
        'rect',
        'stop',
        'text',
        'tspan',
    )

    # nargs=1 guarantees exactly one HTML filename
    html_path = options.files[0]
    document = etree.parse(html_path, etree.HTMLParser())

    # Dictionary mapping an ID (anchor) to [chapter ID, ID title],
    # where 'chapter ID' is the ID of the chapter it appears in
    xref_map = {}

    # Each <div class="sect1"> element corresponds to a chapter
    for chapter in document.findall('.//div[@class="sect1"]'):
        # The chapter ID comes from the single <h2 id=> inside the chapter
        headings = chapter.findall('.//h2[@id]')
        if len(headings) != 1:
            raise UserWarning(f'Error! <div> must have exactly 1 <h2> element, has {len(headings)}')
        chapter_anchor = headings[0].get('id')

        # Record every anchor in the chapter, and report tags carrying an
        # id= that are neither anchors nor known-ignorable.
        for elem in chapter.findall('.//*[@id]'):
            if elem.tag in anchor_tags:
                add_id(chapter, elem, xref_map, chapter_anchor)
            elif elem.tag not in ignored_tags:
                print(f'    Rejecting unknown tag with ID <{elem.tag} id="{elem.get("id")}"', file=sys.stderr)

    # Write whichever outputs were requested, Python first
    for outfile, scripttype in ((options.pyfile, 'python'),
                                (options.jsfile, 'javascript')):
        if outfile is not None:
            generate_map(xref_map, outfile, scripttype)