# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Token rules for a PLY (ply.lex) based lexer.

Defines |LexError| and the |Lexer| class. |Lexer| carries the token list, the
regular expressions and the t_* rules in the form ply.lex expects to find on a
rule object.
"""

import os.path
import sys

def _GetDirAbove(dirname):
  """Returns the directory "above" this file containing |dirname| (which must
  also be "above" this file).

  Args:
    dirname: Name of a single path component to look for while walking from
        this file's absolute path towards the filesystem root.

  Returns:
    The path of the directory that directly contains the component named
    |dirname|.

  Raises:
    AssertionError: If the root is reached without finding |dirname|.
  """
  path = os.path.abspath(__file__)
  while True:
    path, tail = os.path.split(path)
    # At the filesystem root os.path.split() yields an empty tail. Raise
    # explicitly rather than via an |assert| statement: asserts are stripped
    # under "python -O", which would turn this loop into an infinite one.
    if not tail:
      raise AssertionError(
          "No directory named %r above %s" % (dirname,
                                              os.path.abspath(__file__)))
    if tail == dirname:
      return path

from ply.lex import TOKEN

from ..error import Error


class LexError(Error):
  """Class for errors from the lexer."""

  def __init__(self, filename, message, lineno):
    """Forwards |filename|, |message| and |lineno| to |Error.__init__|.

    Args:
      filename: Name of the file being lexed.
      message: Description of the problem.
      lineno: Line number the error is attributed to.
    """
    Error.__init__(self, filename, message, lineno=lineno)


# We have methods which look like they could be functions:
# pylint: disable=R0201
class Lexer(object):
  """Holder of the token names, regexes and t_* rules used with ply.lex.

  Rules written as methods are tried in the order they are defined below; that
  order is significant (see the comments next to the numeric and ordinal
  rules). Rules written as plain strings are, per the comment on the numeric
  rules, ordered by PLY according to regex length.

  Every rule that detects an error reports it by raising |LexError| through
  |_error|.
  """

  def __init__(self, filename):
    """Remembers |filename| so that it can be attached to raised errors."""
    self.filename = filename

  ######################--   PRIVATE   --######################

  ##
  ## Internal auxiliary methods
  ##
  def _error(self, msg, token):
    """Raises a |LexError| with |msg| at the line number of |token|."""
    raise LexError(self.filename, msg, token.lineno)

  ##
  ## Reserved keywords
  ##
  keywords = (
    'HANDLE',

    'IMPORT',
    'MODULE',
    'STRUCT',
    'UNION',
    'INTERFACE',
    'ENUM',
    'CONST',
    'TRUE',
    'FALSE',
    'DEFAULT',
    'ARRAY',
    'MAP',
    'ASSOCIATED'
  )

  # Maps the lowercase spelling of each keyword to its token type; consulted
  # by |t_NAME|. Lookups use the identifier text as-is, so only the lowercase
  # spelling is treated as a keyword.
  keyword_map = {}
  for keyword in keywords:
    keyword_map[keyword.lower()] = keyword

  ##
  ## All the tokens recognized by the lexer
  ##
  tokens = keywords + (
    # Identifiers
    'NAME',

    # Constants
    'ORDINAL',
    'INT_CONST_DEC', 'INT_CONST_HEX',
    'FLOAT_CONST',

    # String literals
    'STRING_LITERAL',

    # Operators
    'MINUS',
    'PLUS',
    'AMP',
    'QSTN',

    # Assignment
    'EQUALS',

    # Request / response
    'RESPONSE',

    # Delimiters
    'LPAREN', 'RPAREN',         # ( )
    'LBRACKET', 'RBRACKET',     # [ ]
    'LBRACE', 'RBRACE',         # { }
    'LANGLE', 'RANGLE',         # < >
    'SEMI',                     # ;
    'COMMA', 'DOT'              # , .
  )

  ##
  ## Regexes for use in tokens
  ##

  # valid C identifiers (K&R2: A.2.3)
  identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

  hex_prefix = '0[xX]'
  hex_digits = '[0-9a-fA-F]+'

  # integer constants (K&R2: A.2.5.1)
  decimal_constant = '0|([1-9][0-9]*)'
  hex_constant = hex_prefix+hex_digits
  # Don't allow octal constants (even invalid octal).
  octal_constant_disallowed = '0[0-9]+'

  # character constants (K&R2: A.2.5.2)
  # Note: letters and the characters . _ ~ ! = & ^ - \ ? ' " are accepted
  # after a backslash by |simple_escape|. The stated reason for being this
  # permissive is to support #line directives with Windows paths as filenames
  # (..\..\dir\file). For the same reason, |decimal_escape| allows all digit
  # sequences. We want to parse all correct code, even if it means sometimes
  # parsing incorrect code.
  #
  simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
  decimal_escape = r"""(\d+)"""
  hex_escape = r"""(x[0-9a-fA-F]+)"""
  bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""

  escape_sequence = \
      r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'

  # string literals (K&R2: A.2.6)
  string_char = r"""([^"\\\n]|"""+escape_sequence+')'
  string_literal = '"'+string_char+'*"'
  bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

  # floating constants (K&R2: A.2.5.3)
  exponent_part = r"""([eE][-+]?[0-9]+)"""
  fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
  floating_constant = \
      '(((('+fractional_constant+')'+ \
      exponent_part+'?)|([0-9]+'+exponent_part+')))'

  # Ordinals
  ordinal = r'@[0-9]+'
  missing_ordinal_value = r'@'
  # Don't allow ordinal values in octal (even invalid octal, like 09) or
  # hexadecimal.
  octal_or_hex_ordinal_disallowed = r'@((0[0-9]+)|('+hex_prefix+hex_digits+'))'

  ##
  ## Rules for the normal state
  ##

  # Spaces, tabs and carriage returns are skipped between tokens.
  t_ignore = ' \t\r'

  # Newlines: advance the line counter by the number of newlines matched.
  # Nothing is returned, so no token is produced for them. The docstring
  # below is the rule's regex and must stay the first statement.
  def t_NEWLINE(self, t):
    r'\n+'
    t.lexer.lineno += len(t.value)

  # Operators
  t_MINUS = r'-'
  t_PLUS = r'\+'
  t_AMP = r'&'
  t_QSTN = r'\?'

  # =
  t_EQUALS = r'='

  # =>
  t_RESPONSE = r'=>'

  # Delimiters
  t_LPAREN = r'\('
  t_RPAREN = r'\)'
  t_LBRACKET = r'\['
  t_RBRACKET = r'\]'
  t_LBRACE = r'\{'
  t_RBRACE = r'\}'
  t_LANGLE = r'<'
  t_RANGLE = r'>'
  t_COMMA = r','
  t_DOT = r'\.'
  t_SEMI = r';'

  t_STRING_LITERAL = string_literal

  # The following floating and integer constants are defined as
  # functions to impose a strict order (otherwise, decimal
  # is placed before the others because its regex is longer,
  # and this is bad)
  #
  @TOKEN(floating_constant)
  def t_FLOAT_CONST(self, t):
    return t

  @TOKEN(hex_constant)
  def t_INT_CONST_HEX(self, t):
    return t

  # Must precede |t_INT_CONST_DEC|: the decimal rule would otherwise accept
  # the leading "0" on its own.
  @TOKEN(octal_constant_disallowed)
  def t_OCTAL_CONSTANT_DISALLOWED(self, t):
    msg = "Octal values not allowed"
    self._error(msg, t)

  @TOKEN(decimal_constant)
  def t_INT_CONST_DEC(self, t):
    return t

  # unmatched string literals are caught by the preprocessor

  # A complete string literal that contains an escape matched by |bad_escape|.
  @TOKEN(bad_string_literal)
  def t_BAD_STRING_LITERAL(self, t):
    msg = "String contains invalid escape code"
    self._error(msg, t)

  # Handle ordinal-related tokens in the right order: the most specific
  # pattern (disallowed octal/hex form) first, then a valid ordinal, and
  # finally a lone "@".
  @TOKEN(octal_or_hex_ordinal_disallowed)
  def t_OCTAL_OR_HEX_ORDINAL_DISALLOWED(self, t):
    msg = "Octal and hexadecimal ordinal values not allowed"
    self._error(msg, t)

  @TOKEN(ordinal)
  def t_ORDINAL(self, t):
    return t

  @TOKEN(missing_ordinal_value)
  def t_BAD_ORDINAL(self, t):
    msg = "Missing ordinal value"
    self._error(msg, t)

  # Identifiers and keywords share one rule: the token type becomes the
  # keyword's type when the text is a key of |keyword_map|, otherwise "NAME".
  @TOKEN(identifier)
  def t_NAME(self, t):
    t.type = self.keyword_map.get(t.value, "NAME")
    return t

  # Ignore C and C++ style comments. A run of consecutive // lines is matched
  # as a single comment. The line counter is advanced by the newlines the
  # comment spans. The docstring below is the rule's regex and must stay the
  # first statement.
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    t.lexer.lineno += t.value.count("\n")

  # Reports the first character of the remaining input, which no rule matched.
  def t_error(self, t):
    msg = "Illegal character %s" % repr(t.value[0])
    self._error(msg, t)