# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import os.path
import sys

def _GetDirAbove(dirname):
  """Returns the directory "above" this file containing |dirname| (which must
  also be "above" this file)."""
  path = os.path.abspath(__file__)
  while True:
    path, tail = os.path.split(path)
    assert tail
    if tail == dirname:
      return path

from ply.lex import TOKEN

from ..error import Error


class LexError(Error):
  """Class for errors from the lexer."""

  def __init__(self, filename, message, lineno):
    Error.__init__(self, filename, message, lineno=lineno)


# We have methods which look like they could be functions:
# pylint: disable=R0201
class Lexer(object):

  def __init__(self, filename):
    self.filename = filename

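  # NOTE: ply builds the actual lexer from an instance of this class; a rough,
  # hypothetical usage sketch (names like mojom_source_text are placeholders):
  #
  #   import ply.lex as lex
  #   lexer = lex.lex(object=Lexer("example.mojom"))
  #   lexer.input(mojom_source_text)
  #   tok = lexer.token()   # a ply LexToken, or None at end of input
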
  ######################--   PRIVATE   --######################

  ##
  ## Internal auxiliary methods
  ##
  def _error(self, msg, token):
    raise LexError(self.filename, msg, token.lineno)

  ##
  ## Reserved keywords
  ##
  keywords = (
    'HANDLE',

    'IMPORT',
    'MODULE',
    'STRUCT',
    'UNION',
    'INTERFACE',
    'ENUM',
    'CONST',
    'TRUE',
    'FALSE',
    'DEFAULT',
    'ARRAY',
    'MAP',
    'ASSOCIATED'
  )

  keyword_map = {}
  for keyword in keywords:
    keyword_map[keyword.lower()] = keyword
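  # e.g. keyword_map['struct'] == 'STRUCT'; t_NAME() below uses this map to
  # turn identifiers that match a keyword into keyword tokens.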

  ##
  ## All the tokens recognized by the lexer
  ##
  tokens = keywords + (
    # Identifiers
    'NAME',

    # Constants
    'ORDINAL',
    'INT_CONST_DEC', 'INT_CONST_HEX',
    'FLOAT_CONST',

    # String literals
    'STRING_LITERAL',

    # Operators
    'MINUS',
    'PLUS',
    'AMP',
    'QSTN',

    # Assignment
    'EQUALS',

    # Request / response
    'RESPONSE',

    # Delimiters
    'LPAREN', 'RPAREN',         # ( )
    'LBRACKET', 'RBRACKET',     # [ ]
    'LBRACE', 'RBRACE',         # { }
    'LANGLE', 'RANGLE',         # < >
    'SEMI',                     # ;
    'COMMA', 'DOT'              # , .
  )
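  # ply discovers lexing rules from the t_<TOKEN> attributes defined below;
  # token types with no rule of their own (the keywords above) are produced by
  # re-typing NAME matches in t_NAME().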

  ##
  ## Regexes for use in tokens
  ##

  # valid C identifiers (K&R2: A.2.3)
  identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

  hex_prefix = '0[xX]'
  hex_digits = '[0-9a-fA-F]+'

  # integer constants (K&R2: A.2.5.1)
  decimal_constant = '0|([1-9][0-9]*)'
  hex_constant = hex_prefix+hex_digits
  # Don't allow octal constants (even invalid octal).
  octal_constant_disallowed = '0[0-9]+'

  # character constants (K&R2: A.2.5.2)
  # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
  # directives with Windows paths as filenames (..\..\dir\file)
  # For the same reason, decimal_escape allows all digit sequences. We want to
  # parse all correct code, even if it means to sometimes parse incorrect
  # code.
  #
  simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
  decimal_escape = r"""(\d+)"""
  hex_escape = r"""(x[0-9a-fA-F]+)"""
  bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""

  escape_sequence = \
      r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'

  # string literals (K&R2: A.2.6)
  string_char = r"""([^"\\\n]|"""+escape_sequence+')'
  string_literal = '"'+string_char+'*"'
  bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

  # floating constants (K&R2: A.2.5.3)
  exponent_part = r"""([eE][-+]?[0-9]+)"""
  fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
  floating_constant = \
      '(((('+fractional_constant+')'+ \
      exponent_part+'?)|([0-9]+'+exponent_part+')))'

  # Ordinals
  ordinal = r'@[0-9]+'
  missing_ordinal_value = r'@'
  # Don't allow ordinal values in octal (even invalid octal, like 09) or
  # hexadecimal.
  octal_or_hex_ordinal_disallowed = r'@((0[0-9]+)|('+hex_prefix+hex_digits+'))'
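  # e.g. "@3" lexes as an ORDINAL, while "@09" and "@0x1" match the rule above
  # and are reported as errors.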

  ##
  ## Rules for the normal state
  ##
  t_ignore = ' \t\r'

  # Newlines
  def t_NEWLINE(self, t):
    r'\n+'
    t.lexer.lineno += len(t.value)

  # Operators
  t_MINUS             = r'-'
  t_PLUS              = r'\+'
  t_AMP               = r'&'
  t_QSTN              = r'\?'

  # =
  t_EQUALS            = r'='

  # =>
  t_RESPONSE          = r'=>'

  # Delimiters
  t_LPAREN            = r'\('
  t_RPAREN            = r'\)'
  t_LBRACKET          = r'\['
  t_RBRACKET          = r'\]'
  t_LBRACE            = r'\{'
  t_RBRACE            = r'\}'
  t_LANGLE            = r'<'
  t_RANGLE            = r'>'
  t_COMMA             = r','
  t_DOT               = r'\.'
  t_SEMI              = r';'

  t_STRING_LITERAL    = string_literal

  # The following floating and integer constants are defined as
  # functions to impose a strict order (otherwise, decimal
  # is placed before the others because its regex is longer,
  # and this is bad)
  #
  @TOKEN(floating_constant)
  def t_FLOAT_CONST(self, t):
    return t

  @TOKEN(hex_constant)
  def t_INT_CONST_HEX(self, t):
    return t

  @TOKEN(octal_constant_disallowed)
  def t_OCTAL_CONSTANT_DISALLOWED(self, t):
    msg = "Octal values not allowed"
    self._error(msg, t)

  @TOKEN(decimal_constant)
  def t_INT_CONST_DEC(self, t):
    return t

  # unmatched string literals are caught by the preprocessor

  @TOKEN(bad_string_literal)
  def t_BAD_STRING_LITERAL(self, t):
    msg = "String contains invalid escape code"
    self._error(msg, t)

  # Handle ordinal-related tokens in the right order:
  @TOKEN(octal_or_hex_ordinal_disallowed)
  def t_OCTAL_OR_HEX_ORDINAL_DISALLOWED(self, t):
    msg = "Octal and hexadecimal ordinal values not allowed"
    self._error(msg, t)

  @TOKEN(ordinal)
  def t_ORDINAL(self, t):
    return t

  @TOKEN(missing_ordinal_value)
  def t_BAD_ORDINAL(self, t):
    msg = "Missing ordinal value"
    self._error(msg, t)

  @TOKEN(identifier)
  def t_NAME(self, t):
    t.type = self.keyword_map.get(t.value, "NAME")
    return t

  # Ignore C and C++ style comments
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    t.lexer.lineno += t.value.count("\n")

  def t_error(self, t):
    msg = "Illegal character %s" % repr(t.value[0])
    self._error(msg, t)
250