# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import imp
import os.path
import sys


def _GetDirAbove(dirname):
  """Returns the directory "above" this file containing |dirname| (which must
  also be "above" this file).

  Walks upwards from this file's absolute path, one component at a time, and
  returns the parent of the first component equal to |dirname|.

  Raises:
    AssertionError: if no path component above this file is named |dirname|.
  """
  path = os.path.abspath(__file__)
  while True:
    path, tail = os.path.split(path)
    # An empty tail means the filesystem root was reached. Raise explicitly
    # rather than with an `assert` statement: asserts are stripped under
    # `python -O`, which would turn a missing directory into an endless loop.
    if not tail:
      raise AssertionError(
          'No directory named "%s" above %s' % (dirname, __file__))
    if tail == dirname:
      return path


# Make "ply" importable: if it cannot be found on the current module search
# path, fall back to the "third_party" directory that sits next to "mojo".
# NOTE(review): the `imp` module is deprecated and was removed in Python 3.12;
# this probe needs replacing before the file can run on newer interpreters.
try:
  imp.find_module("ply")
except ImportError:
  sys.path.append(os.path.join(_GetDirAbove("mojo"), "third_party"))
from ply.lex import TOKEN

from ..error import Error


class LexError(Error):
  """Class for errors from the lexer."""

  def __init__(self, filename, message, lineno):
    """Forwards |filename|, |message| and |lineno| to the Error base class."""
    Error.__init__(self, filename, message, lineno=lineno)


# We have methods which look like they could be functions:
# pylint: disable=R0201
class Lexer(object):
  """Token definitions (names, regexes and rule functions) for ply.lex.

  The class only declares the rules; no lexer object is built here.
  Attributes and methods named t_* follow the PLY naming convention, and for
  rule functions the docstring (or the @TOKEN argument) is the regex itself,
  so those docstrings must not be edited as documentation.
  """

  def __init__(self, filename):
    """Remembers |filename| so that errors can report where they occurred."""
    self.filename = filename

  ######################-- PRIVATE --######################

  ##
  ## Internal auxiliary methods
  ##
  def _error(self, msg, token):
    """Raises a LexError carrying |msg| and the line number of |token|."""
    raise LexError(self.filename, msg, token.lineno)

  ##
  ## Reserved keywords
  ##
  keywords = (
    'HANDLE',

    'IMPORT',
    'MODULE',
    'STRUCT',
    'UNION',
    'INTERFACE',
    'ENUM',
    'CONST',
    'TRUE',
    'FALSE',
    'DEFAULT',
    'ARRAY',
    'MAP',
    'ASSOCIATED'
  )

  # Maps the lowercase spelling found in the input (e.g. "struct") to the
  # token type name (e.g. "STRUCT"); consulted by t_NAME.
  keyword_map = {}
  for keyword in keywords:
    keyword_map[keyword.lower()] = keyword

  ##
  ## All the tokens recognized by the lexer
  ##
  tokens = keywords + (
    # Identifiers
    'NAME',

    # Constants
    'ORDINAL',
    'INT_CONST_DEC', 'INT_CONST_HEX',
    'FLOAT_CONST',

    # String literals
    'STRING_LITERAL',

    # Operators
    'MINUS',
    'PLUS',
    'AMP',
    'QSTN',

    # Assignment
    'EQUALS',

    # Request / response
    'RESPONSE',

    # Delimiters
    'LPAREN', 'RPAREN',         # ( )
    'LBRACKET', 'RBRACKET',     # [ ]
    'LBRACE', 'RBRACE',         # { }
    'LANGLE', 'RANGLE',         # < >
    'SEMI',                     # ;
    'COMMA', 'DOT'              # , .
  )

  ##
  ## Regexes for use in tokens
  ##

  # valid C identifiers (K&R2: A.2.3)
  identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

  hex_prefix = '0[xX]'
  hex_digits = '[0-9a-fA-F]+'

  # integer constants (K&R2: A.2.5.1)
  decimal_constant = '0|([1-9][0-9]*)'
  hex_constant = hex_prefix+hex_digits
  # Don't allow octal constants (even invalid octal).
  octal_constant_disallowed = '0[0-9]+'

  # character constants (K&R2: A.2.5.2)
  # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
  # directives with Windows paths as filenames (..\..\dir\file)
  # For the same reason, decimal_escape allows all digit sequences. We want to
  # parse all correct code, even if it means to sometimes parse incorrect
  # code.
  #
  simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
  decimal_escape = r"""(\d+)"""
  hex_escape = r"""(x[0-9a-fA-F]+)"""
  # A "bad" escape is a backslash followed by any character that none of the
  # escapes above accept. The digit range is 0-9 (not 0-7) so that it agrees
  # with decimal_escape, which accepts every digit; with 0-7, "\8" and "\9"
  # were accepted by decimal_escape yet still matched bad_string_literal and
  # were rejected by t_BAD_STRING_LITERAL.
  bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

  escape_sequence = \
      r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'

  # string literals (K&R2: A.2.6)
  string_char = r"""([^"\\\n]|"""+escape_sequence+')'
  string_literal = '"'+string_char+'*"'
  bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

  # floating constants (K&R2: A.2.5.3)
  exponent_part = r"""([eE][-+]?[0-9]+)"""
  fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
  floating_constant = \
      '(((('+fractional_constant+')'+ \
      exponent_part+'?)|([0-9]+'+exponent_part+')))'

  # Ordinals
  ordinal = r'@[0-9]+'
  missing_ordinal_value = r'@'
  # Don't allow ordinal values in octal (even invalid octal, like 09) or
  # hexadecimal.
  octal_or_hex_ordinal_disallowed = r'@((0[0-9]+)|('+hex_prefix+hex_digits+'))'

  ##
  ## Rules for the normal state
  ##
  t_ignore = ' \t\r'

  # Newlines
  # The docstring below is the rule's regex. The rule produces no token; it
  # only advances the line counter by the number of newlines consumed.
  def t_NEWLINE(self, t):
    r'\n+'
    t.lexer.lineno += len(t.value)

  # Operators
  t_MINUS = r'-'
  t_PLUS = r'\+'
  t_AMP = r'&'
  t_QSTN = r'\?'

  # =
  t_EQUALS = r'='

  # =>
  t_RESPONSE = r'=>'

  # Delimiters
  t_LPAREN = r'\('
  t_RPAREN = r'\)'
  t_LBRACKET = r'\['
  t_RBRACKET = r'\]'
  t_LBRACE = r'\{'
  t_RBRACE = r'\}'
  t_LANGLE = r'<'
  t_RANGLE = r'>'
  t_COMMA = r','
  t_DOT = r'\.'
  t_SEMI = r';'

  t_STRING_LITERAL = string_literal

  # The following floating and integer constants are defined as
  # functions to impose a strict order (otherwise, decimal
  # is placed before the others because its regex is longer,
  # and this is bad)
  #
  @TOKEN(floating_constant)
  def t_FLOAT_CONST(self, t):
    return t

  @TOKEN(hex_constant)
  def t_INT_CONST_HEX(self, t):
    return t

  # Must come before t_INT_CONST_DEC so that e.g. "017" is reported as an
  # error instead of being split into "0" followed by "17".
  @TOKEN(octal_constant_disallowed)
  def t_OCTAL_CONSTANT_DISALLOWED(self, t):
    msg = "Octal values not allowed"
    self._error(msg, t)

  @TOKEN(decimal_constant)
  def t_INT_CONST_DEC(self, t):
    return t

  # unmatched string literals are caught by the preprocessor
  @TOKEN(bad_string_literal)
  def t_BAD_STRING_LITERAL(self, t):
    msg = "String contains invalid escape code"
    self._error(msg, t)

  # Handle ordinal-related tokens in the right order:
  @TOKEN(octal_or_hex_ordinal_disallowed)
  def t_OCTAL_OR_HEX_ORDINAL_DISALLOWED(self, t):
    msg = "Octal and hexadecimal ordinal values not allowed"
    self._error(msg, t)

  @TOKEN(ordinal)
  def t_ORDINAL(self, t):
    return t

  # A bare "@" that was not consumed by either ordinal rule above.
  @TOKEN(missing_ordinal_value)
  def t_BAD_ORDINAL(self, t):
    msg = "Missing ordinal value"
    self._error(msg, t)

  # Identifiers and keywords share one rule: the matched text is looked up in
  # keyword_map (lowercase keys) and falls back to the generic NAME type.
  @TOKEN(identifier)
  def t_NAME(self, t):
    t.type = self.keyword_map.get(t.value, "NAME")
    return t

  # Ignore C and C++ style comments
  # The docstring below is the rule's regex. The rule produces no token but
  # keeps the line counter in step with the newlines inside the comment.
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    t.lexer.lineno += t.value.count("\n")

  # Reports the first character of the remaining input, which no rule matched.
  def t_error(self, t):
    msg = "Illegal character %s" % repr(t.value[0])
    self._error(msg, t)