1# Copyright 2014 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import imp
6import os.path
7import sys
8
9# Disable lint check for finding modules:
10# pylint: disable=F0401
11
12def _GetDirAbove(dirname):
13  """Returns the directory "above" this file containing |dirname| (which must
14  also be "above" this file)."""
15  path = os.path.abspath(__file__)
16  while True:
17    path, tail = os.path.split(path)
18    assert tail
19    if tail == dirname:
20      return path
21
# Make "ply" importable: if it is not already on sys.path, assume a
# Chromium-style checkout and add the third_party directory (a sibling of
# the "mojo" directory above this file) to the module search path.
try:
  imp.find_module("ply")
except ImportError:
  sys.path.append(os.path.join(_GetDirAbove("mojo"), "third_party"))
26from ply.lex import TOKEN
27
28from ..error import Error
29
30
31# Disable lint check for exceptions deriving from Exception:
32# pylint: disable=W0710
class LexError(Error):
  """Error raised when the lexer encounters input it cannot tokenize."""

  def __init__(self, filename, message, lineno):
    super(LexError, self).__init__(filename, message, lineno=lineno)
38
39
40# We have methods which look like they could be functions:
41# pylint: disable=R0201
class Lexer(object):
  """Token definitions for mojom files, consumed by ply.lex.

  ply.lex builds a lexer by reflecting over this class: every attribute
  whose name starts with "t_" is a token rule (a plain regex string for
  simple tokens, a method for tokens that need code), "tokens" declares the
  set of token names, and "t_error" handles otherwise unmatchable input.
  NOTE: the definition order of the t_* methods below is significant — ply
  tries function rules in source order (string rules are tried afterwards,
  longest regex first) — so do not reorder them.
  """

  def __init__(self, filename):
    # Filename reported in LexError diagnostics; the input text itself is
    # fed to the lexer separately by ply.
    self.filename = filename

  ######################--   PRIVATE   --######################

  ##
  ## Internal auxiliary methods
  ##
  def _error(self, msg, token):
    # Abort lexing by raising; lexing does not resume after this.
    raise LexError(self.filename, msg, token.lineno)

  ##
  ## Reserved keywords
  ##
  keywords = (
    'HANDLE',

    'IMPORT',
    'MODULE',
    'STRUCT',
    'INTERFACE',
    'ENUM',
    'CONST',
    'TRUE',
    'FALSE',
    'DEFAULT',
  )

  # Maps the lower-case source spelling (e.g. "struct") to its token type
  # (e.g. 'STRUCT'); t_NAME uses this to reclassify identifiers that are
  # actually keywords.
  keyword_map = {}
  for keyword in keywords:
    keyword_map[keyword.lower()] = keyword

  ##
  ## All the tokens recognized by the lexer
  ##
  tokens = keywords + (
    # Identifiers
    'NAME',

    # Constants
    'ORDINAL',
    'INT_CONST_DEC', 'INT_CONST_HEX',
    'FLOAT_CONST',
    'CHAR_CONST',

    # String literals
    'STRING_LITERAL',

    # Operators
    'MINUS',
    'PLUS',
    'AMP',

    # Assignment
    'EQUALS',

    # Request / response
    'RESPONSE',

    # Delimiters
    'LPAREN', 'RPAREN',         # ( )
    'LBRACKET', 'RBRACKET',     # [ ]
    'LBRACE', 'RBRACE',         # { }
    'LANGLE', 'RANGLE',         # < >
    'SEMI',                     # ;
    'COMMA', 'DOT'              # , .
  )

  ##
  ## Regexes for use in tokens (class attributes so the rule definitions
  ## below can be composed from them via @TOKEN / concatenation)
  ##

  # valid C identifiers (K&R2: A.2.3)
  identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

  hex_prefix = '0[xX]'
  hex_digits = '[0-9a-fA-F]+'

  # integer constants (K&R2: A.2.5.1)
  decimal_constant = '0|([1-9][0-9]*)'
  hex_constant = hex_prefix+hex_digits
  # Don't allow octal constants (even invalid octal).
  octal_constant_disallowed = '0[0-9]+'

  # character constants (K&R2: A.2.5.2)
  # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
  # directives with Windows paths as filenames (..\..\dir\file)
  # For the same reason, decimal_escape allows all digit sequences. We want to
  # parse all correct code, even if it means to sometimes parse incorrect
  # code.
  #
  simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
  decimal_escape = r"""(\d+)"""
  hex_escape = r"""(x[0-9a-fA-F]+)"""
  bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""

  escape_sequence = \
      r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
  cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
  char_const = "'"+cconst_char+"'"
  unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
  bad_char_const = \
      r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+ \
      bad_escape+r"""[^'\n]*')"""

  # string literals (K&R2: A.2.6)
  string_char = r"""([^"\\\n]|"""+escape_sequence+')'
  string_literal = '"'+string_char+'*"'
  bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

  # floating constants (K&R2: A.2.5.3)
  exponent_part = r"""([eE][-+]?[0-9]+)"""
  fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
  floating_constant = \
      '(((('+fractional_constant+')'+ \
      exponent_part+'?)|([0-9]+'+exponent_part+')))'

  # Ordinals
  ordinal = r'@[0-9]+'
  missing_ordinal_value = r'@'
  # Don't allow ordinal values in octal (even invalid octal, like 09) or
  # hexadecimal.
  octal_or_hex_ordinal_disallowed = r'@((0[0-9]+)|('+hex_prefix+hex_digits+'))'

  ##
  ## Rules for the normal state
  ##

  # Characters skipped between tokens.
  t_ignore = ' \t\r'

  # Newlines: not a real token, but a function rule so line numbers stay
  # accurate for error reporting.
  def t_NEWLINE(self, t):
    r'\n+'
    t.lexer.lineno += len(t.value)

  # Operators
  t_MINUS             = r'-'
  t_PLUS              = r'\+'
  t_AMP               = r'&'

  # =
  t_EQUALS            = r'='

  # => (ply tries longer string regexes first, so this wins over t_EQUALS)
  t_RESPONSE          = r'=>'

  # Delimiters
  t_LPAREN            = r'\('
  t_RPAREN            = r'\)'
  t_LBRACKET          = r'\['
  t_RBRACKET          = r'\]'
  t_LBRACE            = r'\{'
  t_RBRACE            = r'\}'
  t_LANGLE            = r'<'
  t_RANGLE            = r'>'
  t_COMMA             = r','
  t_DOT               = r'\.'
  t_SEMI              = r';'

  t_STRING_LITERAL    = string_literal

  # The following floating and integer constants are defined as
  # functions to impose a strict order (otherwise, decimal
  # is placed before the others because its regex is longer,
  # and this is bad)
  #
  @TOKEN(floating_constant)
  def t_FLOAT_CONST(self, t):
    return t

  @TOKEN(hex_constant)
  def t_INT_CONST_HEX(self, t):
    return t

  # Defined before t_INT_CONST_DEC so a leading-zero number is rejected
  # rather than lexed as "0" followed by another number.
  @TOKEN(octal_constant_disallowed)
  def t_OCTAL_CONSTANT_DISALLOWED(self, t):
    msg = "Octal values not allowed"
    self._error(msg, t)

  @TOKEN(decimal_constant)
  def t_INT_CONST_DEC(self, t):
    return t

  # Must come before bad_char_const, to prevent it from
  # catching valid char constants as invalid
  #
  @TOKEN(char_const)
  def t_CHAR_CONST(self, t):
    return t

  @TOKEN(unmatched_quote)
  def t_UNMATCHED_QUOTE(self, t):
    msg = "Unmatched '"
    self._error(msg, t)

  @TOKEN(bad_char_const)
  def t_BAD_CHAR_CONST(self, t):
    msg = "Invalid char constant %s" % t.value
    self._error(msg, t)

  # unmatched string literals are caught by the preprocessor

  @TOKEN(bad_string_literal)
  def t_BAD_STRING_LITERAL(self, t):
    msg = "String contains invalid escape code"
    self._error(msg, t)

  # Handle ordinal-related tokens in the right order:
  # disallowed forms first, then valid @N, then a bare @.
  @TOKEN(octal_or_hex_ordinal_disallowed)
  def t_OCTAL_OR_HEX_ORDINAL_DISALLOWED(self, t):
    msg = "Octal and hexadecimal ordinal values not allowed"
    self._error(msg, t)

  @TOKEN(ordinal)
  def t_ORDINAL(self, t):
    return t

  @TOKEN(missing_ordinal_value)
  def t_BAD_ORDINAL(self, t):
    msg = "Missing ordinal value"
    self._error(msg, t)

  # Identifiers; keywords are lexed as identifiers and reclassified here.
  @TOKEN(identifier)
  def t_NAME(self, t):
    t.type = self.keyword_map.get(t.value, "NAME")
    return t

  # Ignore C and C++ style comments (still counting their newlines so
  # t.lexer.lineno stays correct).
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    t.lexer.lineno += t.value.count("\n")

  # Called by ply for input no rule matches.
  def t_error(self, t):
    msg = "Illegal character %s" % repr(t.value[0])
    self._error(msg, t)
278