examples/pygments-lexer/yaml.py

"""
yaml.py

Lexer for YAML, a human-friendly data serialization language
(http://yaml.org/).

Written by Kirill Simonov <xi@resolvent.net>.

License: Whatever suitable for inclusion into the Pygments package.
"""

from pygments.lexer import  \
        ExtendedRegexLexer, LexerContext, include, bygroups
from pygments.token import  \
        Text, Comment, Punctuation, Name, Literal

# Names exported by a star-import of this module: only the lexer class.
__all__ = ['YAMLLexer']


class YAMLLexerContext(LexerContext):
    """Lexer context extended with the indentation state used for YAML."""

    def __init__(self, *args, **kwds):
        super(YAMLLexerContext, self).__init__(*args, **kwds)
        # Indentation of the current block scalar; None until it is known.
        self.block_scalar_indent = None
        # Candidate indentation level accumulated while scanning a line.
        self.next_indent = 0
        # Current indentation level; -1 means no level has been set yet.
        self.indent = -1
        # Enclosing indentation levels, innermost last.
        self.indent_stack = []


def something(TokenClass):
    """Build a callback that emits ``TokenClass`` only for non-empty matches.

    For an empty match the callback yields nothing and leaves
    ``context.pos`` untouched.
    """
    def callback(lexer, match, context):
        matched = match.group()
        if matched:
            yield match.start(), TokenClass, matched
            context.pos = match.end()
    return callback

def reset_indent(TokenClass):
    """Build a callback that clears all indentation state.

    The callback restores the context's indentation fields to their initial
    values, emits the matched text as ``TokenClass`` and advances
    ``context.pos`` past the match.
    """
    def callback(lexer, match, context):
        # Forget every indentation level collected so far.
        context.indent_stack, context.indent = [], -1
        context.next_indent, context.block_scalar_indent = 0, None
        yield match.start(), TokenClass, match.group()
        context.pos = match.end()
    return callback

def save_indent(TokenClass, start=False):
    """Save a possible indentation level.

    Returns a lexer callback that records the width of the matched
    whitespace in ``context.next_indent``.

    With ``start=True`` the match is the leading indentation of a line:
    ``next_indent`` is set to its width and, if the line is indented less
    than the current level, enclosing levels are popped from
    ``context.indent_stack`` until one fits.  If the new indentation then
    falls strictly between two known levels, the part beyond the enclosing
    level is emitted as ``TokenClass.Error``.

    With ``start=False`` the width of the match is added to ``next_indent``.

    Empty tokens are not emitted; ``context.pos`` is always advanced to the
    end of the match.
    """
    def callback(lexer, match, context):
        text = match.group()
        extra = ''
        if start:
            context.next_indent = len(text)
            if context.next_indent < context.indent:
                while context.next_indent < context.indent:
                    context.indent = context.indent_stack.pop()
                if context.next_indent > context.indent:
                    # ``indent`` is -1 when every level has been popped.
                    # Clamp it so the slices below split at column 0
                    # instead of being read as negative indices (which
                    # would flag only the last space as an error).
                    split = max(context.indent, 0)
                    extra = text[split:]
                    text = text[:split]
        else:
            context.next_indent += len(text)
        if text:
            yield match.start(), TokenClass, text
        if extra:
            yield match.start()+len(text), TokenClass.Error, extra
        context.pos = match.end()
    return callback

def set_indent(TokenClass, implicit=False):
    """Build a callback that commits the saved indentation level.

    If the pending ``context.next_indent`` is deeper than the current
    level, the current level is pushed onto ``context.indent_stack`` and
    replaced by the pending one.  Unless ``implicit`` is true, the width of
    the matched text is then added to ``next_indent``.  The matched text is
    emitted as ``TokenClass``.
    """
    def callback(lexer, match, context):
        indicator = match.group()
        pending = context.next_indent
        if pending > context.indent:
            # Enter a deeper level, remembering the one being left.
            context.indent_stack.append(context.indent)
            context.indent = pending
        if not implicit:
            context.next_indent = pending + len(indicator)
        yield match.start(), TokenClass, indicator
        context.pos = match.end()
    return callback

def set_block_scalar_indent(TokenClass):
    """Build a callback that handles a block scalar header.

    ``context.block_scalar_indent`` is always cleared first.  For a
    non-empty match, group 1 (the indentation digit, when present) is added
    to the current indentation level, with a negative level counted as 0,
    and stored as the explicit block scalar indentation.  The header is
    emitted as ``TokenClass``.  An empty match emits nothing and does not
    move ``context.pos``.
    """
    def callback(lexer, match, context):
        context.block_scalar_indent = None
        header = match.group()
        if not header:
            return
        digit = match.group(1)
        if digit:
            base = max(context.indent, 0)
            context.block_scalar_indent = base + int(digit)
        yield match.start(), TokenClass, header
        context.pos = match.end()
    return callback

def parse_block_scalar_empty_line(IndentTokenClass, ContentTokenClass):
    """Build a callback for a whitespace-only line inside a block scalar.

    When the block scalar indentation is known and the line is longer than
    it, the line is split at that column: the first part is emitted as
    ``IndentTokenClass`` and the remainder as ``ContentTokenClass``.
    Otherwise the whole (non-empty) line is emitted as
    ``IndentTokenClass``.
    """
    def callback(lexer, match, context):
        spaces = match.group()
        begin = match.start()
        width = context.block_scalar_indent
        if width is not None and len(spaces) > width:
            # Spaces beyond the scalar's indentation belong to its content.
            yield begin, IndentTokenClass, spaces[:width]
            yield begin + width, ContentTokenClass, spaces[width:]
        elif spaces:
            yield begin, IndentTokenClass, spaces
        context.pos = match.end()
    return callback

def parse_block_scalar_indent(TokenClass):
    """Build a callback for the leading spaces of a block scalar line.

    If the block scalar indentation is not yet known, a line indented
    deeper than the current level (a negative level counts as 0) fixes it.
    A line that is not indented enough ends the scalar: two states are
    popped from ``context.stack`` and nothing is emitted.  Otherwise the
    non-empty indentation is emitted as ``TokenClass``.
    """
    def callback(lexer, match, context):
        spaces = match.group()
        width = len(spaces)
        known = context.block_scalar_indent
        if known is None:
            too_shallow = width <= max(context.indent, 0)
        else:
            too_shallow = width < known
        if too_shallow:
            # The line is outside the scalar; leave without consuming it.
            context.stack.pop()
            context.stack.pop()
            return
        if known is None:
            # The first sufficiently indented line sets the indentation.
            context.block_scalar_indent = width
        if spaces:
            yield match.start(), TokenClass, spaces
            context.pos = match.end()
    return callback

def parse_plain_scalar_indent(TokenClass):
    """Build a callback for the leading spaces of a plain scalar line.

    A continuation line must be indented deeper than the current level.
    If it is, its non-empty indentation is emitted as ``TokenClass``;
    otherwise two states are popped from ``context.stack`` and nothing is
    emitted.
    """
    def callback(lexer, match, context):
        spaces = match.group()
        if len(spaces) > context.indent:
            if spaces:
                yield match.start(), TokenClass, spaces
                context.pos = match.end()
        else:
            # Not a continuation of the scalar; leave without consuming.
            context.stack.pop()
            context.stack.pop()
    return callback


class YAMLLexer(ExtendedRegexLexer):
    """Lexer for the YAML language.

    The ``tokens`` table maps each state name to an ordered list of rules.
    Several rules use the callback factories defined in this module, which
    keep the indentation state on the lexer context and may leave states
    by popping ``context.stack`` themselves.
    """

    name = 'YAML'
    aliases = ['yaml']
    filenames = ['*.yaml', '*.yml']
    mimetypes = ['text/x-yaml']

    tokens = {

        # the root rules
        'root': [
            # ignored whitespaces
            (r'[ ]+(?=#|$)', Text.Blank),
            # line breaks
            (r'\n+', Text.Break),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # the '%YAML' directive
            (r'^%YAML(?=[ ]|$)', reset_indent(Name.Directive),
                'yaml-directive'),
            # the '%TAG' directive
            (r'^%TAG(?=[ ]|$)', reset_indent(Name.Directive),
                'tag-directive'),
            # document start and document end indicators
            (r'^(?:---|\.\.\.)(?=[ ]|$)',
                reset_indent(Punctuation.Document), 'block-line'),
            # indentation spaces
            (r'[ ]*(?![ \t\n\r\f\v]|$)',
                save_indent(Text.Indent, start=True),
                ('block-line', 'indentation')),
        ],

        # trailing whitespaces after directives or a block scalar indicator
        'ignored-line': [
            # ignored whitespaces
            (r'[ ]+(?=#|$)', Text.Blank),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # line break
            (r'\n', Text.Break, '#pop:2'),
        ],

        # the %YAML directive
        'yaml-directive': [
            # the version number
            (r'([ ]+)([0-9]+\.[0-9]+)',
                bygroups(Text.Blank, Literal.Version), 'ignored-line'),
        ],

        # the %TAG directive
        'tag-directive': [
            # a tag handle and the corresponding prefix
            (r'([ ]+)(!|![0-9A-Za-z_-]*!)'
                r'([ ]+)(!|!?[0-9A-Za-z;/?:@&=+$,_.!~*\'()\[\]%-]+)',
                bygroups(Text.Blank, Name.Type, Text.Blank, Name.Type),
                'ignored-line'),
        ],

        # block collection indicators and indentation spaces
        'indentation': [
            # trailing whitespaces are ignored
            (r'[ ]*$', something(Text.Blank), '#pop:2'),
            # whitespaces preceding block collection indicators
            (r'[ ]+(?=[?:-](?:[ ]|$))', save_indent(Text.Indent)),
            # block collection indicators
            (r'[?:-](?=[ ]|$)', set_indent(Punctuation.Indicator)),
            # the beginning of a block line
            (r'[ ]*', save_indent(Text.Indent), '#pop'),
        ],

        # an indented line in the block context
        'block-line': [
            # the line end
            (r'[ ]*(?=#|$)', something(Text.Blank), '#pop'),
            # whitespaces separating tokens
            (r'[ ]+', Text.Blank),
            # tags, anchors and aliases,
            include('descriptors'),
            # block collections and scalars
            include('block-nodes'),
            # flow collections and quoted scalars
            include('flow-nodes'),
            # a plain scalar
            (r'(?=[^ \t\n\r\f\v?:,\[\]{}#&*!|>\'"%@`-]|[?:-][^ \t\n\r\f\v])',
                something(Literal.Scalar.Plain),
                'plain-scalar-in-block-context'),
        ],

        # tags, anchors, aliases
        'descriptors' : [
            # a full-form tag
            (r'!<[0-9A-Za-z;/?:@&=+$,_.!~*\'()\[\]%-]+>', Name.Type),
            # a tag in the form '!', '!suffix' or '!handle!suffix'
            (r'!(?:[0-9A-Za-z_-]+)?'
                r'(?:![0-9A-Za-z;/?:@&=+$,_.!~*\'()\[\]%-]+)?', Name.Type),
            # an anchor
            (r'&[0-9A-Za-z_-]+', Name.Anchor),
            # an alias
            (r'\*[0-9A-Za-z_-]+', Name.Alias),
        ],

        # block collections and scalars
        'block-nodes': [
            # implicit key
            (r':(?=[ ]|$)', set_indent(Punctuation.Indicator, implicit=True)),
            # literal and folded scalars
            (r'[|>]', Punctuation.Indicator,
                ('block-scalar-content', 'block-scalar-header')),
        ],

        # flow collections and quoted scalars
        'flow-nodes': [
            # a flow sequence
            (r'\[', Punctuation.Indicator, 'flow-sequence'),
            # a flow mapping
            (r'\{', Punctuation.Indicator, 'flow-mapping'),
            # a single-quoted scalar
            (r'\'', Literal.Scalar.Flow.Quote, 'single-quoted-scalar'),
            # a double-quoted scalar
            (r'\"', Literal.Scalar.Flow.Quote, 'double-quoted-scalar'),
        ],

        # the content of a flow collection
        'flow-collection': [
            # whitespaces
            (r'[ ]+', Text.Blank),
            # line breaks
            (r'\n+', Text.Break),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # simple indicators
            (r'[?:,]', Punctuation.Indicator),
            # tags, anchors and aliases
            include('descriptors'),
            # nested collections and quoted scalars
            include('flow-nodes'),
            # a plain scalar
            (r'(?=[^ \t\n\r\f\v?:,\[\]{}#&*!|>\'"%@`])',
                something(Literal.Scalar.Plain),
                'plain-scalar-in-flow-context'),
        ],

        # a flow sequence indicated by '[' and ']'
        'flow-sequence': [
            # include flow collection rules
            include('flow-collection'),
            # the closing indicator
            (r'\]', Punctuation.Indicator, '#pop'),
        ],

        # a flow mapping indicated by '{' and '}'
        'flow-mapping': [
            # include flow collection rules
            include('flow-collection'),
            # the closing indicator
            (r'\}', Punctuation.Indicator, '#pop'),
        ],

        # block scalar lines
        'block-scalar-content': [
            # line break
            (r'\n', Text.Break),
            # empty line
            (r'^[ ]+$',
                parse_block_scalar_empty_line(Text.Indent,
                    Literal.Scalar.Block)),
            # indentation spaces (we may leave the state here)
            (r'^[ ]*', parse_block_scalar_indent(Text.Indent)),
            # line content
            (r'[^\n\r\f\v]+', Literal.Scalar.Block),
        ],

        # the header of a literal or folded scalar
        'block-scalar-header': [
            # indentation indicator followed by chomping flag
            (r'([1-9])?[+-]?(?=[ ]|$)',
                set_block_scalar_indent(Punctuation.Indicator),
                'ignored-line'),
            # chomping flag followed by indentation indicator
            (r'[+-]?([1-9])?(?=[ ]|$)',
                set_block_scalar_indent(Punctuation.Indicator),
                'ignored-line'),
        ],

        # ignored and regular whitespaces in quoted scalars
        'quoted-scalar-whitespaces': [
            # leading and trailing whitespaces are ignored
            (r'^[ ]+|[ ]+$', Text.Blank),
            # line breaks are ignored
            (r'\n+', Text.Break),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Flow),
        ],

        # single-quoted scalars
        'single-quoted-scalar': [
            # include whitespace and line break rules
            include('quoted-scalar-whitespaces'),
            # escaping of the quote character
            (r'\'\'', Literal.Scalar.Flow.Escape),
            # regular non-whitespace characters
            (r'[^ \t\n\r\f\v\']+', Literal.Scalar.Flow),
            # the closing quote
            (r'\'', Literal.Scalar.Flow.Quote, '#pop'),
        ],

        # double-quoted scalars
        'double-quoted-scalar': [
            # include whitespace and line break rules
            include('quoted-scalar-whitespaces'),
            # escaping of special characters
            (r'\\[0abt\tn\nvfre "\\N_LP]', Literal.Scalar.Flow.Escape),
            # escape codes
            (r'\\(?:x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})',
                Literal.Scalar.Flow.Escape),
            # regular non-whitespace characters
            (r'[^ \t\n\r\f\v\"\\]+', Literal.Scalar.Flow),
            # the closing quote
            (r'"', Literal.Scalar.Flow.Quote, '#pop'),
        ],

        # the beginning of a new line while scanning a plain scalar
        'plain-scalar-in-block-context-new-line': [
            # empty lines
            (r'^[ ]+$', Text.Blank),
            # line breaks
            (r'\n+', Text.Break),
            # document start and document end indicators
            (r'^(?=---|\.\.\.)', something(Punctuation.Document), '#pop:3'),
            # indentation spaces (we may leave the block line state here)
            (r'^[ ]*', parse_plain_scalar_indent(Text.Indent), '#pop'),
        ],

        # a plain scalar in the block context
        'plain-scalar-in-block-context': [
            # the scalar ends with the ':' indicator
            (r'[ ]*(?=:[ ]|:$)', something(Text.Blank), '#pop'),
            # the scalar ends with whitespaces followed by a comment
            (r'[ ]+(?=#)', Text.Blank, '#pop'),
            # trailing whitespaces are ignored
            (r'[ ]+$', Text.Blank),
            # line breaks are ignored
            (r'\n+', Text.Break, 'plain-scalar-in-block-context-new-line'),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Plain),
            # regular non-whitespace characters
            (r'(?::(?![ \t\n\r\f\v])|[^ \t\n\r\f\v:])+',
                Literal.Scalar.Plain),
        ],

        # a plain scalar in the flow context
        'plain-scalar-in-flow-context': [
            # the scalar ends with an indicator character
            (r'[ ]*(?=[,:?\[\]{}])', something(Text.Blank), '#pop'),
            # the scalar ends with a comment
            (r'[ ]+(?=#)', Text.Blank, '#pop'),
            # leading and trailing whitespaces are ignored
            (r'^[ ]+|[ ]+$', Text.Blank),
            # line breaks are ignored
            (r'\n+', Text.Break),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Plain),
            # regular non-whitespace characters
            (r'[^ \t\n\r\f\v,:?\[\]{}]+', Literal.Scalar.Plain),
        ],

    }

    def get_tokens_unprocessed(self, text=None, context=None):
        """Tokenize ``text`` using a YAML-aware lexer context.

        When no ``context`` is supplied, a fresh
        ``YAMLLexerContext(text, 0)`` is created so that the indentation
        attributes used by the rule callbacks exist.  The work itself is
        delegated to the parent class, whose result is returned unchanged.
        """
        if context is None:
            context = YAMLLexerContext(text, 0)
        return super(YAMLLexer, self).get_tokens_unprocessed(text, context)