#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL

The lexer uses the PLY library to build a tokenizer which understands both
WebIDL and Pepper tokens.

The WebIDL specification, including its regular expressions, can be found at:
   http://www.w3.org/TR/2012/CR-WebIDL-20120419/
PLY can be found at:
   http://www.dabeaz.com/ply/
"""

import os.path
import sys

#
# Try to load the ply module; if that fails, assume it is in the third_party
# directory.
#
try:
  # Disable lint check which fails to find the ply module.
  # pylint: disable=F0401
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  # pylint: disable=F0401
  from ply import lex

#
# IDL Lexer
#
class IDLLexer(object):
  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = r'"*.(){}[],;:=+-/~|&^?<>'
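  # Because of 'literals', a single character such as '{' or ';' comes back
  # as a token whose type and value are both that character.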

  # 't_ignore' contains ignored characters (spaces and tabs)
  t_ignore = ' \t'

  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Data types
      'float',
      'integer',
      'string',

    # Symbol and keyword types
      'COMMENT',
      'identifier',

    # MultiChar operators
      'ELLIPSIS',
  ]

  # 'keywords' is a map of string to token type.  All tokens matching
  # KEYWORD_OR_SYMBOL are looked up in the keywords dictionary to determine
  # whether the token is actually a keyword.
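  # For example, the input 'interface' becomes an INTERFACE token, while an
  # unlisted name such as 'Foo' becomes an identifier token (see
  # t_KEYWORD_OR_SYMBOL below).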
  keywords = {
    'any' : 'ANY',
    'attribute' : 'ATTRIBUTE',
    'boolean' : 'BOOLEAN',
    'byte' : 'BYTE',
    'callback' : 'CALLBACK',
    'const' : 'CONST',
    'creator' : 'CREATOR',
    'Date' : 'DATE',
    'deleter' : 'DELETER',
    'dictionary' : 'DICTIONARY',
    'DOMString' : 'DOMSTRING',
    'double' : 'DOUBLE',
    'enum'  : 'ENUM',
    'false' : 'FALSE',
    'float' : 'FLOAT',
    'exception' : 'EXCEPTION',
    'getter': 'GETTER',
    'implements' : 'IMPLEMENTS',
    'Infinity' : 'INFINITY',
    'inherit' : 'INHERIT',
    'interface' : 'INTERFACE',
    'legacycaller' : 'LEGACYCALLER',
    'long' : 'LONG',
    'NaN' : 'NAN',
    'null' : 'NULL',
    'object' : 'OBJECT',
    'octet' : 'OCTET',
    'optional' : 'OPTIONAL',
    'or' : 'OR',
    'partial'  : 'PARTIAL',
    'readonly' : 'READONLY',
    'sequence' : 'SEQUENCE',
    'setter': 'SETTER',
    'short' : 'SHORT',
    'static' : 'STATIC',
    'stringifier' : 'STRINGIFIER',
    'typedef' : 'TYPEDEF',
    'true' : 'TRUE',
    'unsigned' : 'UNSIGNED',
    'unrestricted' : 'UNRESTRICTED',
    'void' : 'VOID'
  }

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>.  In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.
  #
  # These need to be methods for lexer construction, despite not using self.
  # pylint: disable=R0201
  def t_ELLIPSIS(self, t):
    r'\.\.\.'
    return t

  # PLY takes the rule's regular expression from the docstring, so the long
  # pattern below cannot be wrapped; disable the line-length lint check.
  # pylint: disable=C0301
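  # For example, t_float below matches '1.5', '.5', and '3e-2', while
  # t_integer matches decimal '42', hex '0x1F', and octal '017'.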
  def t_float(self, t):
    r'-?(([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)([Ee][+-]?[0-9]+)?|[0-9]+[Ee][+-]?[0-9]+)'
    return t

  def t_integer(self, t):
    r'-?([1-9][0-9]*|0[Xx][0-9A-Fa-f]+|0[0-7]*)'
    return t


  # A line ending '\n'; we use this to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings.  Strings are exclusively
  # used for attributes and enums, and not used as typical 'C' constants.
  def t_string(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment:  /* xxx */ or //
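  # Note that comments are returned as tokens rather than discarded, so a
  # parser built on this lexer can still see them.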
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_OR_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols
    t.type = self.keywords.get(t.value, 'identifier')

    # We strip the leading underscore so that a symbol can have the same value
    # as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t

  def t_ANY_error(self, t):
    msg = 'Unrecognized input'
    line = self.Lexer().lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.Lexer().lexpos - offs)
      msg = 'Unexpected EoF reached after'

    pos = self.Lexer().lexpos - self.index[line]
    out = self.ErrorMessage(line, pos, msg)
    sys.stderr.write(out + '\n')
    self._lex_errors += 1


  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line.  In the case
    # of multiple lines, tokens cannot exist on any of the lines except the
    # last one, so the recorded values for the earlier lines are unused.  We
    # still fill the array, however, to keep the line count correct.
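    # For example, when t_LINE_END matches two consecutive newlines, both new
    # entries hold the same lexpos: the offset of the line that follows them.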
    self.Lexer().lineno += count
    for _ in range(count):
      self.index.append(self.Lexer().lexpos)

  def FileLineMsg(self, line, msg):
    # Generate a message containing the file and line number of a token.
    filename = self.Lexer().filename
    if filename:
      return "%s(%d) : %s" % (filename, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, line, pos):
    # Create a source line marker
    caret = ' ' * pos + '^'
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

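  # ErrorMessage output has the form (after a leading blank line):
  #   <filename>(<line>) : <msg>
  #   <offending source line>
  #        ^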
  def ErrorMessage(self, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(line, msg),
        self.SourceLine(line, pos))

#
# Tokenizer
#
# The token function returns the next token provided by IDLLexer for matching
# against the leaf patterns.
#
  def token(self):
    tok = self.Lexer().token()
    if tok:
      self.last = tok
    return tok


  def GetTokens(self):
    outlist = []
    while True:
      t = self.Lexer().token()
      if not t:
        break
      outlist.append(t)
    return outlist

  def Tokenize(self, data, filename='__no_file__'):
    lexer = self.Lexer()
    lexer.lineno = 1
    lexer.filename = filename
    lexer.input(data)
    self.lines = data.split('\n')

  def KnownTokens(self):
    return self.tokens

  def Lexer(self):
    if not self._lexobj:
      self._lexobj = lex.lex(object=self, lextab=None, optimize=0)
    return self._lexobj

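  # The helpers below manage the token and keyword tables.  They are used by
  # __init__, and they also allow a derived lexer to add or remove keywords
  # before the underlying PLY lexer is first built by Lexer().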
  def _AddToken(self, token):
    if token in self.tokens:
      raise RuntimeError('Same token: ' + token)
    self.tokens.append(token)

  def _AddTokens(self, tokens):
    for token in tokens:
      self._AddToken(token)

  def _AddKeywords(self, keywords):
    for key in keywords:
      value = key.upper()
      self._AddToken(value)
      self.keywords[key] = value

  def _DelKeywords(self, keywords):
    for key in keywords:
      self.tokens.remove(key.upper())
      del self.keywords[key]

  def __init__(self):
    self.index = [0]
    self._lex_errors = 0
    self.filename = None
    self.keywords = {}
    self.tokens = []
    self._AddTokens(IDLLexer.tokens)
    self._AddKeywords(IDLLexer.keywords)
    self._lexobj = None
    self.last = None
    self.lines = None

# If run by itself, attempt to build the lexer
if __name__ == '__main__':
  lexer_object = IDLLexer()
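  # Illustrative example: tokenize a small, made-up IDL snippet and print
  # each token's type and value.
  lexer_object.Tokenize('interface Foo { attribute long bar; };')
  for tok in lexer_object.GetTokens():
    print('%s %s' % (tok.type, str(tok.value)))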