#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL """

#
# IDL Lexer
#
# The lexer uses the PLY lex library to build a tokenizer which understands
# WebIDL tokens.
#
# WebIDL, and the WebIDL regular expressions, can be found at:
#   http://dev.w3.org/2006/webapi/WebIDL/
# PLY can be found at:
#   http://www.dabeaz.com/ply/

import os.path
import re
import sys

#
# Try to load the ply module; if it is not found, assume it lives in the
# third_party directory, relative to ppapi.
#
try:
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  from ply import lex

from idl_option import GetOption, Option, ParseOptions


Option('output', 'Generate output.')

#
# IDL Lexer
#
class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Symbol and keyword types
      'COMMENT',
      'DESCRIBE',
      'ENUM',
      'LABEL',
      'SYMBOL',
      'INLINE',
      'INTERFACE',
      'READONLY',
      'STRUCT',
      'TYPEDEF',
      'OR',

    # Extra WebIDL keywords
      'CALLBACK',
      'DICTIONARY',
      'OPTIONAL',
      'STATIC',

    # Invented for apps use
      'NAMESPACE',

    # Data types
      'FLOAT',
      'OCT',
      'INT',
      'HEX',
      'STRING',

    # Operators
      'LSHIFT',
      'RSHIFT'
  ]

  # 'keywords' is a map of string to token type.  All SYMBOL tokens are
  # matched against keywords, to determine if the token is actually a keyword.
  keywords = {
    'describe' : 'DESCRIBE',
    'enum' : 'ENUM',
    'label' : 'LABEL',
    'interface' : 'INTERFACE',
    'readonly' : 'READONLY',
    'struct' : 'STRUCT',
    'typedef' : 'TYPEDEF',

    'callback' : 'CALLBACK',
    'dictionary' : 'DICTIONARY',
    'optional' : 'OPTIONAL',
    'static' : 'STATIC',
    'namespace' : 'NAMESPACE',

    'or' : 'OR',
  }

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-/~|&^?'

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>.  In the
  # case of a function, the function is called when a match is made.  These
  # definitions come from WebIDL.

  # 't_ignore' is a special match of items to ignore
  t_ignore = ' \t'

  # Constant values
  t_FLOAT = r'-?(\d+\.\d*|\d*\.\d+)([Ee][+-]?\d+)?|-?\d+[Ee][+-]?\d+'
  t_INT = r'-?[0-9]+[uU]?'
  t_OCT = r'-?0[0-7]+'
  t_HEX = r'-?0[Xx][0-9A-Fa-f]+'
  t_LSHIFT = r'<<'
  t_RSHIFT = r'>>'
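
  # Illustrative examples (hypothetical literals, not taken from any PPAPI
  # IDL file) of values the constant patterns above are written to match:
  #   FLOAT:  1.0   .5   3.   -2.5e10   1e-3
  #   HEX:    0x1A   -0Xff
  #   INT:    0   42   -7   123u
  #   OCT:    017   -0755
  # Note: PLY adds string-based rules to its master pattern in order of
  # decreasing regex length, so these are tried as FLOAT, HEX, INT, OCT; an
  # octal literal such as 017 is therefore consumed by the INT rule first.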
  # A line ending '\n'; we use this to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings.  Strings are exclusively
  # used for attributes, and not used as typical 'C' constants.
  def t_STRING(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment:  /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # Return a "preprocessor" inline block
  def t_INLINE(self, t):
    r'\#inline (.|\n)*?\#endinl.*'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols.
    t.type = self.keywords.get(t.value, 'SYMBOL')

    # We strip leading underscores so that you can specify symbols with the
    # same value as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t

  def t_ANY_error(self, t):
    msg = "Unrecognized input"
    line = self.lexobj.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue.
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position.
      self.index.append(self.lexobj.lexpos - offs)
      msg = "Unexpected EoF reached after"

    pos = self.lexobj.lexpos - self.index[line]
    file = self.lexobj.filename
    out = self.ErrorMessage(file, line, pos, msg)
    sys.stderr.write(out + '\n')
    self.lex_errors += 1

  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line.  In the case
    # of multiple lines, tokens cannot exist on any of the lines except the
    # last one, so the recorded values for previous lines are unused.  We
    # still fill the array, however, to make sure the line count is correct.
    self.lexobj.lineno += count
    for i in range(count):
      self.index.append(self.lexobj.lexpos)

  def FileLineMsg(self, file, line, msg):
    if file: return "%s(%d) : %s" % (file, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, file, line, pos):
    caret = '\t^'.expandtabs(pos)
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, file, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(file, line, msg),
        self.SourceLine(file, line, pos))

  def SetData(self, filename, data):
    # Start with line 1, not zero.
    self.lexobj.lineno = 1
    self.lexobj.filename = filename
    self.lines = data.split('\n')
    self.index = [0]
    self.lexobj.input(data)
    self.lex_errors = 0

  def __init__(self):
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)
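

#
# ExampleDumpTokens
#
# Illustrative sketch only (not part of the original generator): shows how to
# drive the lexer by hand on an in-memory string, which is the same loop the
# helpers below repeat for files and text blocks.  Any IDL snippet passed in
# by a caller is hypothetical.
#
def ExampleDumpTokens(source):
  lexer = IDLLexer()
  lexer.SetData('<example>', source)
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    sys.stdout.write('%s %s\n' % (t.type, t.value))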


#
# FilesToTokens
#
# From a set of source file names, generate a list of tokens.
#
def FilesToTokens(filenames, verbose=False):
  lexer = IDLLexer()
  outlist = []
  for filename in filenames:
    data = open(filename).read()
    lexer.SetData(filename, data)
    if verbose: sys.stdout.write('  Loaded %s...\n' % filename)
    while 1:
      t = lexer.lexobj.token()
      if t is None: break
      outlist.append(t)
  return outlist


def TokensFromText(text):
  lexer = IDLLexer()
  lexer.SetData('unknown', text)
  outlist = []
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist


#
# TextToTokens
#
# From a block of text, generate a list of token values.
#
def TextToTokens(source):
  lexer = IDLLexer()
  outlist = []
  lexer.SetData('AUTO', source)
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist


#
# TestSame
#
# From a set of token values, recreate a source text, re-tokenize it, and
# compare the new token values against the old set.
#
def TestSame(values1):
  # Recreate the source from the tokens.  We join with newlines instead of
  # spaces since the '//' and #inline regexes are line sensitive.
  src1 = '\n'.join(values1)
  values2 = TextToTokens(src1)
  src2 = '\n'.join(values2)

  count1 = len(values1)
  count2 = len(values2)
  if count1 != count2:
    print "Size mismatch original %d vs %d\n" % (count1, count2)
    if count1 > count2: count1 = count2

  for i in range(count1):
    if values1[i] != values2[i]:
      print "%d >>%s<< >>%s<<" % (i, values1[i], values2[i])

  if GetOption('output'):
    sys.stdout.write('Generating original.txt and tokenized.txt\n')
    open('original.txt', 'w').write(src1)
    open('tokenized.txt', 'w').write(src2)

  if values1 == values2:
    sys.stdout.write('Same: Pass\n')
    return 0

  print "****************\n%s\n%s***************\n" % (src1, src2)
  sys.stdout.write('Same: Failed\n')
  return -1


#
# TestExpect
#
# From a list of tokens taken in pairs, verify that the type named by the
# first token of each pair matches the type of the second, so that:
#   INT 123 FLOAT 1.1
# will generate a passing test, where the first token is the SYMBOL INT,
# the second token is the INT 123, the third token is the SYMBOL FLOAT and
# the fourth is the FLOAT 1.1, etc...
#
def TestExpect(tokens):
  count = len(tokens)
  index = 0
  errors = 0
  while index < count:
    type = tokens[index].value
    token = tokens[index + 1]
    index += 2

    if type != token.type:
      sys.stderr.write('Mismatch:  Expected %s, but got %s = %s.\n' %
                       (type, token.type, token.value))
      errors += 1

  if not errors:
    sys.stdout.write('Expect: Pass\n')
    return 0

  sys.stdout.write('Expect: Failed\n')
  return -1


def Main(args):
  filenames = ParseOptions(args)

  try:
    tokens = FilesToTokens(filenames, GetOption('verbose'))
    values = [tok.value for tok in tokens]
    if GetOption('output'): sys.stdout.write(' <> '.join(values) + '\n')
    if GetOption('test'):
      if TestSame(values):
        return -1
      if TestExpect(tokens):
        return -1
    return 0

  except lex.LexError as le:
    sys.stderr.write('%s\n' % str(le))
    return -1


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))
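
#
# Note (illustrative): the kind of input TestExpect above is designed for is a
# hypothetical IDL file that interleaves expected token types with literal
# values, e.g.:
#
#   INT 123 HEX 0xFF FLOAT 1.1 LSHIFT <<
#
# Each type name lexes to a SYMBOL token whose value names the type expected
# of the token that immediately follows it.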