src/scripts/reflow.py

#!/usr/bin/python3
#
# Copyright 2016-2024 The Khronos Group Inc.
#
# SPDX-License-Identifier: Apache-2.0

"""Used for automatic reflow of spec sources to satisfy the agreed layout to
minimize git churn.  Also used to insert identifying tags on explicit Valid
Usage statements.

Usage: `reflow.py [-noflow] [-tagvu] [-nextvu #] [-overwrite] [-check FAIL|WARN] [-out dir] [-suffix str] files`

- `-noflow` acts as a passthrough, instead of reflowing text. Other
  processing may occur.
- `-tagvu` generates explicit VUID tag for Valid Usage statements which
  do not already have them.
- `-nextvu #` starts VUID tag generation at the specified # instead of
  the value wired into the `reflow.py` script.
- `-overwrite` updates in place (can be risky, make sure there are backups)
- `-check FAIL|WARN` runs some consistency checks on markup. If the checks
  fail and the WARN option is given, the script will simply print a warning
  message. If the checks fail and the FAIL option is given, the script will
  exit with an error code. FAIL is for use with continuous integration
  scripts enforcing the checks.
- `-out` specifies directory to create output file in, default 'out'
- `-suffix` specifies suffix to add to output files, default ''
- `files` are asciidoc source files from the spec to reflow.
"""
# For error and file-loading interfaces only
import argparse
import os
import re
import sys
from reflib import loadFile, logDiag, logWarn, logErr, setLogFile, getBranch
from pathlib import Path
import doctransformer

# Vulkan-specific - will consolidate into scripts/ like OpenXR soon
# Put the relative 'xml' directory at the front of the module search path
# before importing apiconventions (presumably that is where the module
# lives - TODO confirm). The path is relative to the current directory.
sys.path.insert(0, 'xml')

from apiconventions import APIConventions
# Shared conventions object. This file reads file_suffix,
# spec_no_reflow_dirs and spec_reflow_path from it.
conventions = APIConventions()

# Patterns used to recognize interesting lines in an asciidoc source file.
# Each pattern is compiled exactly once, when the module is loaded.

# Find the pname: or code: patterns in a Valid Usage statement.
# The pname: form also accepts an optional pair of braces around the name,
# so that asciidoc {attribute} references are captured braces included.
pnamePat = re.compile(r'pname:(?P<param>\{?\w+\}?)')
codePat = re.compile(r'code:(?P<param>\w+)')

# Text that may not end sentences

# A single letter followed by a period, typically a middle initial.
endInitial = re.compile(r'^[A-Z]\.$')
# An abbreviation, which does not (usually) end a line.
# Matched case-insensitively at the end of a word.
endAbbrev = re.compile(r'(e\.g|i\.e|c\.f|vs)\.$', re.IGNORECASE)

# Explicit Valid Usage list item with one or more leading asterisks
# The re.DOTALL is needed to prevent vuPat.search() from stripping
# the trailing newline.
# 'head' is the two-space indent plus the asterisks; 'tail' is everything
# after the spaces which follow the bullet.
vuPat = re.compile(r'^(?P<head>  [*]+)( *)(?P<tail>.*)', re.DOTALL)

# VUID with the numeric portion captured in the match object
vuidPat = re.compile(r'VUID-[^-]+-[^-]+-(?P<vuid>[0-9]+)')

# Pattern matching leading nested bullet points (two or more asterisks).
# No 'global' declaration is needed: a name assigned at module scope is
# already a module global.
nestedVuPat = re.compile(r'^  \*\*')

class ReflowCallbacks:
    """State and arguments for reflowing.

    Used with DocTransformer to reflow a file."""
    def __init__(self,
                 filename,
                 vuidDict,
                 margin = 76,
                 breakPeriod = True,
                 reflow = True,
                 nextvu = None,
                 maxvu = None,
                 check = True):
        """Store the settings used while transforming a single file."""

        # --- What is being processed, and shared bookkeeping ---

        self.filename = filename
        """Name of the file being read from."""

        self.vuidDict = vuidDict
        """Dictionary keyed by VUID number.  Each value is a list of
        [file, line] entries recording where that number was seen, so that
        duplicated VUIDs can be reported."""

        self.warnCount = 0
        """Number of markup check warnings raised so far."""

        self.check = check
        """Whether markup consistency checks are to be performed."""

        # --- Text layout ---

        self.reflow = reflow
        """True to reflow text, False to pass paragraphs through as-is."""

        self.margin = margin
        """Column that reflowed text is wrapped to."""

        self.breakPeriod = breakPeriod
        """True if a new line should be started after the end of a
        sentence."""

        self.breakInitial = True
        """True if something that looks like an initial in someone's name
        gets special treatment when deciding where sentences end. **TBD**"""

        # --- Valid Usage tag generation ---

        self.vuPrefix = 'VUID'
        """Leading component of generated Valid Usage tags."""

        self.vuFormat = '{0}-{1}-{2}-{3:0>5d}'
        """Template used to build Valid Usage tags.
        The arguments are, in order: vuPrefix, the command/struct name, the
        parameter name, and the tag number."""

        self.nextvu = nextvu
        """Number that the next un-numbered Valid Usage statement will be
        tagged with, or None if no tagging should be done."""

        self.maxvu = maxvu
        """Highest tag number that may be given to a Valid Usage statement,
        or None when no limit was supplied."""

    def endSentence(self, word):
        """Decide whether `word` closes a sentence.

        A word can only qualify when its last character is a period, and it
        is still rejected when it looks like one of:

         - An abbreviation: 'c.f.', 'e.g.', 'i.e.', 'vs.' (in any case)
         - A single capital letter and a period, such as a middle initial
           (only when breakInitial is True)

        Returns True for a sentence-ending word, False otherwise."""
        # No trailing period: cannot be the end of a sentence.
        if not word.endswith('.'):
            return False

        # Trailing period belongs to a known abbreviation.
        if endAbbrev.search(word):
            return False

        # Trailing period belongs to an initial.
        if self.breakInitial and endInitial.match(word):
            return False

        return True

    def vuidAnchor(self, word):
        """Tell whether `word` begins a Valid Usage ID tag anchor, that is,
        whether it starts with '[[VUID-'."""
        return word.startswith('[[VUID-')

    def visitVUID(self, vuid, line):
        """Record that `vuid` was seen on `line` of the current file.

        A [filename, line] entry is appended to the vuidDict list for
        `vuid`; the list is created the first time a number is seen."""
        occurrences = self.vuidDict.setdefault(vuid, [])
        occurrences.append([self.filename, line])

    def gatherVUIDs(self, para):
        """Collect the VUID tags present in `para` into vuidDict.

        Each line is searched after trailing whitespace is removed, and at
        most one VUID (the first match) is recorded per line.  The collected
        entries are what duplicate-VUID detection works from."""
        for rawLine in para:
            stripped = rawLine.rstrip()

            found = vuidPat.search(stripped)
            if found is None:
                continue

            self.visitVUID(found.group('vuid'), stripped)

    def addVUID(self, para, state):
        """Generate and add a VUID tag to a Valid Usage paragraph if necessary.

        para - list of lines making up the paragraph.
        state - transformer state. `isVU`, `hangIndent`, `apiName` and
          `lineNumber` are read from it.

        Returns a tuple (paragraph, hangIndent).  When no tag is added the
        input paragraph and state.hangIndent are returned untouched.  When a
        tag is added, the returned paragraph starts with the new tag line,
        the new VUID is recorded in vuidDict and nextvu is advanced.

        A tag is only added when all of the following hold:

          - the paragraph is in a Valid Usage block and tagging is enabled
            (nextvu is not None),
          - the first line is a top-level bullet, not a nested one,
          - the first line does not already contain a VUID,
          - nextvu does not exceed maxvu.  A maxvu of None is treated as
            'no upper limit' rather than failing the comparison."""
        hangIndent = state.hangIndent

        if not state.isVU or self.nextvu is None:
            return para, hangIndent

        # If:
        #   - this paragraph is in a Valid Usage block,
        #   - VUID tags are being assigned,
        # Try to assign VUIDs

        if nestedVuPat.search(para[0]):
            # Do not assign VUIDs to nested bullet points.
            # These are now allowed VU markup syntax, but will never
            # themselves be VUs, just subsidiary points.
            return para, hangIndent

        # Skip if there is already a VUID assigned
        if self.vuPrefix in para[0]:
            return para, hangIndent

        # If:
        #   - a tag is not already present, and
        #   - the paragraph is a properly marked-up list item
        # Then add a VUID tag starting with the next free ID.

        # Split the first line after the bullet point
        matches = vuPat.search(para[0])
        if matches is None:
            # There are only a few cases of this, and they are all
            # legitimate. Leave detecting this case to another tool
            # or hand inspection.
            return para, hangIndent

        # Do not assign the VUID unless it is in the reserved range.
        # This is decided before anything is recorded, so that a skipped
        # statement leaves vuidDict, nextvu and the hanging indent alone.
        if self.maxvu is not None and self.nextvu > self.maxvu:
            logWarn('Skipping VUID assignment, no more VUIDs available')
            return para, hangIndent

        logDiag('addVUID: Matched vuPat on line:', para[0], end='')
        head = matches.group('head')
        tail = matches.group('tail')

        # Find pname: or code: tags in the paragraph for the purposes of VUID
        # tag generation. pname:{attribute}s are prioritized to make sure
        # commonvalidity VUIDs end up being unique. Otherwise, the first pname:
        # or code: tag in the paragraph is used, which may not always be
        # correct, but should be highly reliable.
        paraText = ' '.join(para)
        pnameMatches = pnamePat.findall(paraText)
        codeMatches = codePat.findall(paraText)

        # Prioritize {attribute}s, but not the ones in the exception list
        # below.  These have complex expressions including ., ->, or [index]
        # which makes them unsuitable for VUID tags.  Ideally these would be
        # automatically discovered.
        attributeExceptionList = ['maxinstancecheck', 'regionsparam',
                                  'rayGenShaderBindingTableAddress',
                                  'rayGenShaderBindingTableStride',
                                  'missShaderBindingTableAddress',
                                  'missShaderBindingTableStride',
                                  'hitShaderBindingTableAddress',
                                  'hitShaderBindingTableStride',
                                  'callableShaderBindingTableAddress',
                                  'callableShaderBindingTableStride',
                                 ]
        attributeMatches = [match for match in pnameMatches if
                            match[0] == '{' and
                            match[1:-1] not in attributeExceptionList]
        nonattributeMatches = [match for match in pnameMatches if
                               match[0] != '{']

        if attributeMatches:
            paramName = attributeMatches[0]
        elif nonattributeMatches:
            paramName = nonattributeMatches[0]
        elif codeMatches:
            paramName = codeMatches[0]
        else:
            paramName = 'None'
            logWarn(self.filename,
                    'No param name found for VUID tag on line:',
                    para[0])

        # Transform:
        #
        #   * VU first line
        #
        # To:
        #
        #   * [[VUID]]
        #     VU first line
        #
        tagLine = (head + ' [[' +
                   self.vuFormat.format(self.vuPrefix,
                                        state.apiName,
                                        paramName,
                                        self.nextvu) + ']]\n')

        newLines = [tagLine]
        if tail.strip() != '':
            logDiag('transformParagraph first line matches bullet point -'
                    'single line, assuming hangIndent @ input line',
                    state.lineNumber)
            # The remaining text of the first line moves onto its own line,
            # indented one column past the bullet.
            hangIndent = len(head) + 1
            newLines.append(''.ljust(hangIndent) + tail)

        logDiag('Assigning', self.vuPrefix, state.apiName, self.nextvu,
                ' on line:\n' + para[0], '->\n' + newLines[0] + 'END', '\n' + newLines[1] if len(newLines) > 1 else '')

        # Record the tag only now that it is really being assigned.
        self.visitVUID(str(self.nextvu), tagLine)
        self.nextvu = self.nextvu + 1

        return newLines + para[1:], hangIndent

    def transformParagraph(self, para, state):
        """Reflow a given paragraph, respecting the paragraph lead and
        hanging indentation levels.

        The algorithm also respects trailing '+' signs that indicate embedded newlines,
        and will not reflow a very long word immediately after a bullet point.

        Before any reflowing, VUID tags in the paragraph are gathered into
        vuidDict, and a VUID tag is added if addVUID decides one is needed.

        para - list of input lines making up the paragraph.
        state - transformer state; `leadIndent`, `hangIndent` and
          `lineNumber` are read here.

        Returns a list of output lines, each ending in a newline.  If the
        -noflow argument was given (self.reflow is False), the paragraph is
        returned without reflowing, but including any VUID tag that was
        added."""

        self.gatherVUIDs(para)

        # If this is a VU that is missing a VUID, add it to the paragraph now.
        # The hanging indent used below is the one addVUID returns.
        para, hangIndent = self.addVUID(para, state)

        if not self.reflow:
            return para

        logDiag('transformParagraph lead indent = ', state.leadIndent,
                'hangIndent =', state.hangIndent,
                'para:', para[0], end='')

        # Total words processed (we care about the *first* word vs. others)
        wordCount = 0

        # Tracks the *previous* word processed. It must not be empty.
        prevWord = ' '

        # Track the previous line and paragraph being indented, if any
        # outLine is the line being accumulated (None when there is none),
        # and outPara collects the finished lines.
        outLine = None
        outPara = []

        for line in para:
            line = line.rstrip()
            words = line.split()

            # logDiag('transformParagraph: input line =', line)
            # numWords is the index of the last word on this input line
            numWords = len(words) - 1

            for i in range(0, numWords + 1):
                word = words[i]
                wordLen = len(word)
                wordCount += 1

                # endEscape is False, or the trailing '+' / '-' word itself
                # (which is truthy) when that word ends the input line.
                endEscape = False
                if i == numWords and word in ('+', '-'):
                    # Trailing ' +' or ' -' must stay on the same line
                    endEscape = word
                    # logDiag('transformParagraph last word of line =', word,
                    #         'prevWord =', prevWord, 'endEscape =', endEscape)
                else:
                    # logDiag('transformParagraph wordCount =', wordCount,
                    #         'word =', word, 'prevWord =', prevWord)
                    pass

                if wordCount == 1:
                    # The first word of the paragraph is treated specially.
                    # The loop logic becomes trickier if all this code is
                    # done prior to looping over lines and words, so all the
                    # setup logic is done here.

                    outLine = ''.ljust(state.leadIndent) + word
                    outLineLen = state.leadIndent + wordLen

                    # If the paragraph begins with a bullet point, generate
                    # a hanging indent level if there is not one already.
                    if doctransformer.beginBullet.match(para[0]):
                        bulletPoint = True
                        if len(para) > 1:
                            logDiag('transformParagraph first line matches bullet point',
                                    'but indent already hanging @ input line',
                                    state.lineNumber)
                        else:
                            logDiag('transformParagraph first line matches bullet point -'
                                    'single line, assuming hangIndent @ input line',
                                    state.lineNumber)
                            # Single-line paragraph: hang one column past
                            # the first word (the bullet).
                            hangIndent = outLineLen + 1
                    else:
                        bulletPoint = False
                else:
                    # Possible actions to take with this word
                    #
                    # addWord - add word to current line
                    # closeLine - append line and start a new (null) one
                    # startLine - add word to a new line

                    # Default behavior if all the tests below fail is to add
                    # this word to the current line, and keep accumulating
                    # that line.
                    (addWord, closeLine, startLine) = (True, False, False)

                    # How long would this line be if the word were added?
                    newLen = outLineLen + 1 + wordLen

                    # Are we on the first word following a bullet point?
                    firstBullet = (wordCount == 2 and bulletPoint)

                    if endEscape:
                        # If the new word ends the input line with ' +',
                        # add it to the current line.

                        (addWord, closeLine, startLine) = (True, True, False)
                    elif self.vuidAnchor(word):
                        # If the new word is a Valid Usage anchor, break the
                        # line afterwards. Note that this should only happen
                        # immediately after a bullet point, but we do not
                        # currently check for this.
                        (addWord, closeLine, startLine) = (True, True, False)

                    elif newLen > self.margin:
                        if firstBullet:
                            # If the word follows a bullet point, add it to
                            # the current line no matter its length.

                            (addWord, closeLine, startLine) = (True, True, False)
                        elif doctransformer.beginBullet.match(word + ' '):
                            # If the word *is* a bullet point, add it to
                            # the current line no matter its length.
                            # This avoids an innocent inline '-' or '*'
                            # turning into a bogus bullet point.

                            (addWord, closeLine, startLine) = (True, True, False)
                        else:
                            # The word overflows, so add it to a new line.

                            (addWord, closeLine, startLine) = (False, True, True)
                    elif (self.breakPeriod and
                          (wordCount > 2 or not firstBullet) and
                          self.endSentence(prevWord)):
                        # If the previous word ends a sentence and
                        # breakPeriod is set, start a new line.
                        # The complicated logic allows for leading bullet
                        # points which are periods (implicitly numbered lists).
                        # @@@ But not yet for explicitly numbered lists.

                        (addWord, closeLine, startLine) = (False, True, True)

                    # The three actions are applied in this fixed order:
                    # add, then close, then start.

                    # Add a word to the current line
                    if addWord:
                        if outLine:
                            outLine += ' ' + word
                            outLineLen = newLen
                        else:
                            # Fall through to startLine case if there is no
                            # current line yet.
                            startLine = True

                    # Add current line to the output paragraph. Force
                    # starting a new line, although we do not yet know if it
                    # will ever have contents.
                    if closeLine:
                        if outLine:
                            outPara.append(outLine + '\n')
                            outLine = None

                    # Start a new line and add a word to it
                    # New lines after the first use the hanging indent.
                    if startLine:
                        outLine = ''.ljust(hangIndent) + word
                        outLineLen = hangIndent + wordLen

                # Track the previous word, for use in breaking at end of
                # a sentence
                prevWord = word

        # Add last line to the output paragraph.
        if outLine:
            outPara.append(outLine + '\n')

        return outPara

    def onEmbeddedVUConditional(self, state):
        """Report a conditional embedded in a Valid Usage statement.

        Does nothing when checks are disabled.  Otherwise a warning naming
        the file and the line before state.lineNumber is logged, and
        warnCount is increased by one."""
        if not self.check:
            return

        message = 'Detected embedded Valid Usage conditional: {}:{}'.format(
            self.filename, state.lineNumber - 1)
        logWarn(message)

        # One more markup check warning has been raised
        self.warnCount += 1

def reflowFile(filename, args):
    """Transform a single file and fold its results back into `args`.

    filename - path of the file to read.
    args - parsed command-line arguments.  `overwrite`, `outDir`, `suffix`,
      `nowrite`, `vuidDict`, `margin`, `noflow`, `nextvu`, `maxvu` and
      `check` are read; `nextvu` and `warnCount` are updated.

    Returns None.  The file is skipped if it cannot be loaded, or if the
    output file cannot be opened (a warning is logged in the latter case)."""
    logDiag('reflow: filename', filename)

    # Output file handle and reflow object for this file. There are no race
    # conditions on overwriting the input, but it is not recommended unless
    # you have backing store such as git.

    lines, newline_string = loadFile(filename)
    if lines is None:
        return

    if args.overwrite:
        outFilename = filename
    else:
        outDir = Path(args.outDir).resolve()
        outDir.mkdir(parents=True, exist_ok=True)

        outFilename = str(outDir / (os.path.basename(filename) + args.suffix))

    if args.nowrite:
        fp = None
    else:
        try:
            fp = open(outFilename, 'w', encoding='utf8', newline=newline_string)
        except (OSError, ValueError) as exc:
            # Only failures to open the file are treated as skippable;
            # anything else (e.g. KeyboardInterrupt) propagates.
            logWarn('Cannot open output file', outFilename, ':', type(exc))
            return

    try:
        callback = ReflowCallbacks(filename,
                                   args.vuidDict,
                                   margin = args.margin,
                                   reflow = not args.noflow,
                                   nextvu = args.nextvu,
                                   maxvu = args.maxvu,
                                   check = args.check)

        transformer = doctransformer.DocTransformer(filename,
                                                    outfile = fp,
                                                    callback = callback)

        transformer.transformFile(lines)
    finally:
        # Close the output even if the transform raises, so the handle is
        # not leaked.
        if fp is not None:
            fp.close()

    # Update the 'nextvu' value
    if args.nextvu != callback.nextvu:
        logWarn('Updated nextvu to', callback.nextvu, 'after file', filename)
        args.nextvu = callback.nextvu

    args.warnCount += callback.warnCount

def reflowAllAdocFiles(folder_to_reflow, args):
    """Reflow every file under `folder_to_reflow` whose name ends with
    conventions.file_suffix.

    folder_to_reflow - top of the directory tree to process.
    args - parsed command-line arguments, passed through to reflowFile.

    Sub-directories whose lower-cased name is listed in
    conventions.spec_no_reflow_dirs are skipped together with everything
    below them.

    os.walk already descends into sub-directories, so this function must
    not also call itself for each of them: doing both processes nested
    files more than once, and still visits the 'skipped' directories.
    Instead the directory list is pruned in place, which is how os.walk
    (in its default top-down mode) is told not to descend."""
    for root, subdirs, files in os.walk(folder_to_reflow):
        for file in files:
            if file.endswith(conventions.file_suffix):
                file_path = os.path.join(root, file)
                reflowFile(file_path, args)

        keptSubdirs = []
        for subdir in subdirs:
            sub_folder = os.path.join(root, subdir)
            print('Sub-folder = %s' % sub_folder)
            if subdir.lower() not in conventions.spec_no_reflow_dirs:
                print('   Parsing = %s' % sub_folder)
                keptSubdirs.append(subdir)
            else:
                print('   Skipping = %s' % sub_folder)

        # Slice assignment changes the list os.walk itself holds
        subdirs[:] = keptSubdirs

# Script entry point: parse the command line, reflow the requested files (or
# the whole spec tree), report markup and duplicate-VUID problems, and save
# the next free VUID back to vuidCounts.py when it came from there.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('-diag', action='store', dest='diagFile',
                        help='Set the diagnostic file')
    parser.add_argument('-warn', action='store', dest='warnFile',
                        help='Set the warning file')
    parser.add_argument('-log', action='store', dest='logFile',
                        help='Set the log file for both diagnostics and warnings')
    parser.add_argument('-overwrite', action='store_true',
                        help='Overwrite input filenames instead of writing different output filenames')
    parser.add_argument('-out', action='store', dest='outDir',
                        default='out',
                        help='Set the output directory in which updated files are generated (default: out)')
    parser.add_argument('-nowrite', action='store_true',
                        help='Do not write output files, for use with -check')
    parser.add_argument('-check', action='store', dest='check',
                        help='Run markup checks and warn if WARN option is given, error exit if FAIL option is given')
    parser.add_argument('-checkVUID', action='store', dest='checkVUID',
                        help='Detect duplicated VUID numbers and warn if WARN option is given, error exit if FAIL option is given')
    parser.add_argument('-tagvu', action='store_true',
                        help='Tag un-tagged Valid Usage statements starting at the value wired into reflow.py')
    parser.add_argument('-nextvu', action='store', dest='nextvu', type=int,
                        default=None,
                        help='Tag un-tagged Valid Usage statements starting at the specified base VUID instead of the value wired into reflow.py')
    parser.add_argument('-maxvu', action='store', dest='maxvu', type=int,
                        default=None,
                        help='Specify maximum VUID instead of the value wired into vuidCounts.py')
    parser.add_argument('-branch', action='store', dest='branch',
                        help='Specify branch to assign VUIDs for')
    parser.add_argument('-noflow', action='store_true', dest='noflow',
                        help='Do not reflow text. Other actions may apply')
    parser.add_argument('-margin', action='store', type=int, dest='margin',
                        default=76,
                        help='Width to reflow text, defaults to 76 characters')
    parser.add_argument('-suffix', action='store', dest='suffix',
                        default='',
                        help='Set the suffix added to updated file names (default: none)')
    parser.add_argument('files', metavar='filename', nargs='*',
                        help='a filename to reflow text in')
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')

    args = parser.parse_args()

    setLogFile(True,  True, args.logFile)
    setLogFile(True, False, args.diagFile)
    setLogFile(False, True, args.warnFile)

    if args.overwrite:
        logWarn("reflow.py: will overwrite all input files")

    errors = ''
    if args.branch is None:
        (args.branch, errors) = getBranch()
    if args.branch is None:
        # This is not fatal unless VUID assignment is required
        if args.tagvu:
            logErr('Cannot determine current git branch, so cannot assign VUIDs:', errors)

    # These stay None unless the VUID range is loaded from vuidCounts.py
    # below.  They must exist either way, because the save step at the end
    # of the script tests them even when -nextvu was given explicitly.
    vuidCounts = None
    startVUID = None

    if args.tagvu and args.nextvu is None:
        # Moved here since vuidCounts is only needed in the internal
        # repository
        from vuidCounts import vuidCounts

        # NOTE(review): the lookups below assume logErr does not return
        # when the branch is unknown - confirm against reflib.
        if args.branch not in vuidCounts:
            logErr('Branch', args.branch, 'not in vuidCounts, cannot continue')
        maxVUID = vuidCounts[args.branch][1]
        startVUID = vuidCounts[args.branch][2]
        args.nextvu = startVUID
        args.maxvu = maxVUID

    if args.nextvu is not None:
        logWarn('Tagging untagged Valid Usage statements starting at', args.nextvu)

    # Count of markup check warnings encountered
    # This is added to the argparse structure
    args.warnCount = 0

    # Dictionary of VUID numbers found, containing a list of (file, VUID) on
    # which that number was found
    # This is added to the argparse structure
    args.vuidDict = {}

    # If no files are specified, reflow the entire specification chapters folder
    if not args.files:
        folder_to_reflow = conventions.spec_reflow_path
        logWarn('Reflowing all asciidoc files under', folder_to_reflow)
        reflowAllAdocFiles(folder_to_reflow, args)
    else:
        for file in args.files:
            reflowFile(file, args)

    if args.warnCount > 0:
        if args.check == 'FAIL':
            logErr('Failed with', args.warnCount, 'markup errors detected.\n' +
                   'To fix these, you can take actions such as:\n' +
                   '  * Moving conditionals outside VU start / end without changing VU meaning\n' +
                   '  * Refactor conditional text using terminology defined conditionally outside the VU itself\n' +
                   '  * Remove the conditional (allowable when this just affects command / structure / enum names)\n')
        else:
            logWarn('Total warning count for markup issues is', args.warnCount)

    # Look for duplicated VUID numbers
    if args.checkVUID:
        dupVUIDs = 0
        for vuid in sorted(args.vuidDict):
            found = args.vuidDict[vuid]
            if len(found) > 1:
                logWarn('Duplicate VUID number {} found in files:'.format(vuid))
                for (file, vuidLine) in found:
                    logWarn('    {}: {}'.format(file, vuidLine))
                dupVUIDs = dupVUIDs + 1

        if dupVUIDs > 0:
            if args.checkVUID == 'FAIL':
                logErr('Failed with', dupVUIDs, 'duplicated VUID numbers found.\n' +
                       'To fix this, either convert these to commonvalidity VUs if possible, or strip\n' +
                       'the VUIDs from all but one of the duplicates and regenerate new ones.')
            else:
                logWarn('Total number of duplicated VUID numbers is', dupVUIDs)

    # Only save the counts when they were loaded from vuidCounts.py above.
    # With an explicit -nextvu there is no loaded table to update.
    if (vuidCounts is not None and
            args.nextvu is not None and
            args.nextvu != startVUID):
        # Update next free VUID to assign
        vuidCounts[args.branch][2] = args.nextvu
        reflow_count_file_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'vuidCounts.py')
        try:
            # The 'with' block closes the file even if a write fails
            with open(reflow_count_file_path, 'w', encoding='utf8') as reflow_count_file:
                print('# Do not edit this file, unless reserving a new VUID range', file=reflow_count_file)
                print('# VUID ranges reserved for branches', file=reflow_count_file)
                print('# Key is branch name, value is [ start, end, nextfree ]', file=reflow_count_file)
                print('# New reservations must be made by MR to main branch', file=reflow_count_file)
                print('vuidCounts = {', file=reflow_count_file)
                # Entries are written in order of each range's start value
                for key in sorted(vuidCounts.keys(), key=lambda k: vuidCounts[k][0]):
                    counts = vuidCounts[key]
                    print(f"    '{key}': [ {counts[0]}, {counts[1]}, {counts[2]} ],",
                          file=reflow_count_file)
                print('}', file=reflow_count_file)
        except OSError as exc:
            logWarn('Cannot open output count file vuidCounts.py', ':', type(exc))