• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2014 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import imp
6import os.path
7import sys
8
def _GetDirAbove(dirname):
  """Returns the directory "above" this file containing |dirname| (which must
  also be "above" this file).

  Starting from this file's absolute path, path components are stripped one
  at a time until a component named |dirname| is found; the path that
  contains that component is returned.

  Args:
    dirname: Name of a single path component expected somewhere in this
        file's absolute path.

  Returns:
    The path of the directory that contains the nearest component named
    |dirname|.

  Raises:
    AssertionError: If no component of this file's path is named |dirname|.
  """
  path = os.path.abspath(__file__)
  while True:
    path, tail = os.path.split(path)
    # os.path.split() yields an empty tail once the root has been reached.
    # Raise explicitly instead of using `assert`: asserts are stripped under
    # `python -O`, which would leave this loop spinning forever.
    if not tail:
      raise AssertionError(
          "No path component named %r above %s" % (dirname, __file__))
    if tail == dirname:
      return path
18
19try:
20  imp.find_module("ply")
21except ImportError:
22  sys.path.append(os.path.join(_GetDirAbove("mojo"), "third_party"))
23from ply.lex import TOKEN
24
25from ..error import Error
26
27
class LexError(Error):
  """Error raised for input the lexer rejects."""

  def __init__(self, filename, message, lineno):
    # The base class receives the line number as a keyword argument.
    Error.__init__(
        self,
        filename,
        message,
        lineno=lineno)
33
34
35# We have methods which look like they could be functions:
36# pylint: disable=R0201
class Lexer(object):
  """Token definitions and lexing rules, laid out for use with ply.lex.

  The class carries the |tokens| tuple, the building-block regexes, and the
  `t_`-prefixed rules (plain strings and methods). Rule methods are listed
  in the order in which they must be tried, so do not reorder them.
  """

  def __init__(self, filename):
    """Stores |filename| so that lexing errors can report it."""
    self.filename = filename

  ######################--   PRIVATE   --######################

  ##
  ## Internal auxiliary methods
  ##
  def _error(self, msg, token):
    """Raises a LexError with |msg| at the line number of |token|."""
    lineno = token.lineno
    raise LexError(self.filename, msg, lineno)

  ##
  ## Reserved keywords
  ##
  keywords = (
    'HANDLE',

    'IMPORT', 'MODULE',
    'STRUCT', 'UNION', 'INTERFACE', 'ENUM',
    'CONST',
    'TRUE', 'FALSE',
    'DEFAULT',
    'ARRAY', 'MAP',
    'ASSOCIATED'
  )

  # Maps the lower-case spelling found in the input to the token type name.
  keyword_map = {}
  for keyword in keywords:
    keyword_map[keyword.lower()] = keyword

  ##
  ## All the tokens recognized by the lexer
  ##
  tokens = keywords + (
    # Identifiers
    'NAME',

    # Constants
    'ORDINAL',
    'INT_CONST_DEC',
    'INT_CONST_HEX',
    'FLOAT_CONST',

    # String literals
    'STRING_LITERAL',

    # Operators
    'MINUS', 'PLUS', 'AMP', 'QSTN',

    # Assignment
    'EQUALS',

    # Request / response
    'RESPONSE',

    # Delimiters
    'LPAREN', 'RPAREN',         # ( )
    'LBRACKET', 'RBRACKET',     # [ ]
    'LBRACE', 'RBRACE',         # { }
    'LANGLE', 'RANGLE',         # < >
    'SEMI',                     # ;
    'COMMA', 'DOT'              # , .
  )

  ##
  ## Regexes for use in tokens
  ##

  # valid C identifiers (K&R2: A.2.3)
  identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

  hex_prefix = '0[xX]'
  hex_digits = '[0-9a-fA-F]+'

  # integer constants (K&R2: A.2.5.1)
  decimal_constant = '0|([1-9][0-9]*)'
  hex_constant = hex_prefix + hex_digits
  # Octal constants are rejected, including malformed ones such as 09: any
  # digit run with a leading zero is matched here so it can be reported.
  octal_constant_disallowed = '0[0-9]+'

  # character constants (K&R2: A.2.5.2)
  # The escape rules are deliberately permissive: any letter and the
  # punctuation . _ ~ ! = & ^ - \ ? ' " may follow a backslash, and
  # decimal_escape accepts any run of digits. The stated aim is to parse all
  # correct code, even if that means sometimes parsing incorrect code
  # (historically this was to tolerate Windows paths such as ..\..\dir\file
  # in #line directives -- TODO confirm this still applies here).
  #
  simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
  decimal_escape = r"""(\d+)"""
  hex_escape = r"""(x[0-9a-fA-F]+)"""
  # A backslash followed by a character outside the accepted set.
  # NOTE(review): only the digits 0-7 are excluded here, so \8 and \9 match
  # both decimal_escape and bad_escape.
  bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""

  escape_sequence = (
      r"""(\\(""" + simple_escape + '|' + decimal_escape + '|' + hex_escape +
      '))')

  # string literals (K&R2: A.2.6)
  string_char = r"""([^"\\\n]|""" + escape_sequence + ')'
  string_literal = '"' + string_char + '*"'
  bad_string_literal = (
      '"' + string_char + '*' + bad_escape + string_char + '*"')

  # floating constants (K&R2: A.2.5.3)
  exponent_part = r"""([eE][-+]?[0-9]+)"""
  fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
  floating_constant = (
      '((((' + fractional_constant + ')' +
      exponent_part + '?)|([0-9]+' + exponent_part + ')))')

  # Ordinals
  ordinal = r'@[0-9]+'
  missing_ordinal_value = r'@'
  # Ordinal values written in octal (even invalid octal, like 09) or in
  # hexadecimal are matched separately so that they can be rejected.
  octal_or_hex_ordinal_disallowed = (
      r'@((0[0-9]+)|(' + hex_prefix + hex_digits + '))')

  ##
  ## Rules for the normal state
  ##
  t_ignore = ' \t\r'

  # Newlines produce no token; they only advance the line counter.
  def t_NEWLINE(self, t):
    r'\n+'
    newline_count = len(t.value)
    t.lexer.lineno += newline_count

  # Operators
  t_MINUS = r'-'
  t_PLUS = r'\+'
  t_AMP = r'&'
  t_QSTN = r'\?'

  # =
  t_EQUALS = r'='

  # =>
  t_RESPONSE = r'=>'

  # Delimiters
  t_LPAREN = r'\('
  t_RPAREN = r'\)'
  t_LBRACKET = r'\['
  t_RBRACKET = r'\]'
  t_LBRACE = r'\{'
  t_RBRACE = r'\}'
  t_LANGLE = r'<'
  t_RANGLE = r'>'
  t_COMMA = r','
  t_DOT = r'\.'
  t_SEMI = r';'

  t_STRING_LITERAL = string_literal

  # The floating and integer constant rules are written as functions so that
  # they are tried in exactly this order: float, hex, disallowed octal, then
  # decimal. As plain strings, decimal would be placed before the others
  # because its regex is longer, and that would be wrong.
  #
  @TOKEN(floating_constant)
  def t_FLOAT_CONST(self, t):
    return t

  @TOKEN(hex_constant)
  def t_INT_CONST_HEX(self, t):
    return t

  @TOKEN(octal_constant_disallowed)
  def t_OCTAL_CONSTANT_DISALLOWED(self, t):
    self._error("Octal values not allowed", t)

  @TOKEN(decimal_constant)
  def t_INT_CONST_DEC(self, t):
    return t

  # Unmatched string literals get no dedicated rule here (the original note
  # says they are caught by the preprocessor -- TODO confirm).

  @TOKEN(bad_string_literal)
  def t_BAD_STRING_LITERAL(self, t):
    self._error("String contains invalid escape code", t)

  # Ordinal rules, most specific first: the disallowed octal/hex forms, then
  # a well-formed ordinal, then a bare '@' with no value.
  @TOKEN(octal_or_hex_ordinal_disallowed)
  def t_OCTAL_OR_HEX_ORDINAL_DISALLOWED(self, t):
    self._error("Octal and hexadecimal ordinal values not allowed", t)

  @TOKEN(ordinal)
  def t_ORDINAL(self, t):
    return t

  @TOKEN(missing_ordinal_value)
  def t_BAD_ORDINAL(self, t):
    self._error("Missing ordinal value", t)

  # Identifiers whose text is a key of keyword_map are emitted with the
  # keyword's token type; everything else is a NAME.
  @TOKEN(identifier)
  def t_NAME(self, t):
    t.type = self.keyword_map.get(t.value, "NAME")
    return t

  # C and C++ style comments produce no token; a run of consecutive //
  # lines is consumed as a single match. Embedded newlines are still counted.
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    t.lexer.lineno += t.value.count("\n")

  # Reports the first character of the remaining input as illegal.
  def t_error(self, t):
    self._error("Illegal character %s" % repr(t.value[0]), t)
255