import re
import sys
import unittest

sys.path.insert(0, '..')
from pycparser.c_lexer import CLexer


def token_list(clex):
    """Exhaust the lexer `clex`, returning every produced token as a list."""
    # iter(callable, sentinel): call clex.token() until it returns None (EOF).
    return list(iter(clex.token, None))


def token_types(clex):
    """Exhaust the lexer `clex`, returning only the type name of each token."""
    return [i.type for i in token_list(clex)]


class TestCLexerNoErrors(unittest.TestCase):
    """ Test lexing of strings that are not supposed to cause
        errors. Therefore, the error_func passed to the lexer
        raises an exception.
    """
    def error_func(self, msg, line, column):
        # Any lexer error in these tests is unexpected: fail immediately.
        self.fail(msg)

    def on_lbrace_func(self):
        pass

    def on_rbrace_func(self):
        pass

    def type_lookup_func(self, typ):
        # Identifiers starting with 'mytype' are reported as typedef names,
        # so the lexer emits TYPEID for them instead of ID.
        return typ.startswith('mytype')

    def setUp(self):
        # Use the class's no-op brace callbacks (consistent with
        # TestCLexerErrors.setUp; behaviorally identical to bare lambdas).
        self.clex = CLexer(self.error_func, self.on_lbrace_func,
                           self.on_rbrace_func, self.type_lookup_func)
        self.clex.build(optimize=False)

    def assertTokensTypes(self, code, types):
        """Assert that lexing `code` yields exactly the token types `types`."""
        self.clex.input(code)
        self.assertEqual(token_types(self.clex), types)

    def test_trivial_tokens(self):
        self.assertTokensTypes('1', ['INT_CONST_DEC'])
        self.assertTokensTypes('-', ['MINUS'])
        self.assertTokensTypes('volatile', ['VOLATILE'])
        self.assertTokensTypes('...', ['ELLIPSIS'])
        self.assertTokensTypes('++', ['PLUSPLUS'])
        self.assertTokensTypes('case int', ['CASE', 'INT'])
        self.assertTokensTypes('caseint', ['ID'])
        self.assertTokensTypes('$dollar cent$', ['ID', 'ID'])
        self.assertTokensTypes('i ^= 1;', ['ID', 'XOREQUAL', 'INT_CONST_DEC', 'SEMI'])

    def test_id_typeid(self):
        self.assertTokensTypes('myt', ['ID'])
        self.assertTokensTypes('mytype', ['TYPEID'])
        self.assertTokensTypes('mytype6 var', ['TYPEID', 'ID'])

    def test_integer_constants(self):
        self.assertTokensTypes('12', ['INT_CONST_DEC'])
        self.assertTokensTypes('12u', ['INT_CONST_DEC'])
        self.assertTokensTypes('12l', ['INT_CONST_DEC'])
        self.assertTokensTypes('199872Ul', ['INT_CONST_DEC'])
        self.assertTokensTypes('199872lU', ['INT_CONST_DEC'])
        self.assertTokensTypes('199872LL', ['INT_CONST_DEC'])
        self.assertTokensTypes('199872ull', ['INT_CONST_DEC'])
        self.assertTokensTypes('199872llu', ['INT_CONST_DEC'])
        self.assertTokensTypes('1009843200000uLL', ['INT_CONST_DEC'])
        self.assertTokensTypes('1009843200000LLu', ['INT_CONST_DEC'])

        self.assertTokensTypes('077', ['INT_CONST_OCT'])
        self.assertTokensTypes('0123456L', ['INT_CONST_OCT'])

        self.assertTokensTypes('0xf7', ['INT_CONST_HEX'])
        self.assertTokensTypes('0b110', ['INT_CONST_BIN'])
        self.assertTokensTypes('0x01202AAbbf7Ul', ['INT_CONST_HEX'])
        self.assertTokensTypes("'12'", ['INT_CONST_CHAR'])
        self.assertTokensTypes("'123'", ['INT_CONST_CHAR'])
        self.assertTokensTypes("'1AB4'", ['INT_CONST_CHAR'])
        self.assertTokensTypes(r"'1A\n4'", ['INT_CONST_CHAR'])

        # no 0 before x, so ID catches it
        self.assertTokensTypes('xf7', ['ID'])

        # - is MINUS, the rest a constant
        self.assertTokensTypes('-1', ['MINUS', 'INT_CONST_DEC'])

    def test_special_names(self):
        self.assertTokensTypes('sizeof offsetof', ['SIZEOF', 'OFFSETOF'])

    def test_floating_constants(self):
        self.assertTokensTypes('1.5f', ['FLOAT_CONST'])
        self.assertTokensTypes('01.5', ['FLOAT_CONST'])
        self.assertTokensTypes('.15L', ['FLOAT_CONST'])
        self.assertTokensTypes('0.', ['FLOAT_CONST'])

        # but just a period is a period
        self.assertTokensTypes('.', ['PERIOD'])

        self.assertTokensTypes('3.3e-3', ['FLOAT_CONST'])
        self.assertTokensTypes('.7e25L', ['FLOAT_CONST'])
        self.assertTokensTypes('6.e+125f', ['FLOAT_CONST'])
        self.assertTokensTypes('666e666', ['FLOAT_CONST'])
        self.assertTokensTypes('00666e+3', ['FLOAT_CONST'])

        # but this is a hex integer + 3
        self.assertTokensTypes('0x0666e+3', ['INT_CONST_HEX', 'PLUS', 'INT_CONST_DEC'])

    def test_hexadecimal_floating_constants(self):
        self.assertTokensTypes('0xDE.488641p0', ['HEX_FLOAT_CONST'])
        self.assertTokensTypes('0x.488641p0', ['HEX_FLOAT_CONST'])
        self.assertTokensTypes('0X12.P0', ['HEX_FLOAT_CONST'])

    def test_char_constants(self):
        self.assertTokensTypes(r"""'x'""", ['CHAR_CONST'])
        self.assertTokensTypes(r"""L'x'""", ['WCHAR_CONST'])
        self.assertTokensTypes(r"""'\t'""", ['CHAR_CONST'])
        self.assertTokensTypes(r"""'\''""", ['CHAR_CONST'])
        self.assertTokensTypes(r"""'\?'""", ['CHAR_CONST'])
        self.assertTokensTypes(r"""'\0'""", ['CHAR_CONST'])
        self.assertTokensTypes(r"""'\012'""", ['CHAR_CONST'])
        self.assertTokensTypes(r"""'\x2f'""", ['CHAR_CONST'])
        self.assertTokensTypes(r"""'\x2f12'""", ['CHAR_CONST'])
        self.assertTokensTypes(r"""L'\xaf'""", ['WCHAR_CONST'])

    def test_on_rbrace_lbrace(self):
        # Verify the brace callbacks fire once per brace, in source order.
        braces = []
        def on_lbrace():
            braces.append('{')
        def on_rbrace():
            braces.append('}')
        clex = CLexer(self.error_func, on_lbrace, on_rbrace,
                      self.type_lookup_func)
        clex.build(optimize=False)
        clex.input('hello { there } } and again }}{')
        token_list(clex)
        self.assertEqual(braces, ['{', '}', '}', '}', '}', '{'])

    def test_string_literal(self):
        self.assertTokensTypes('"a string"', ['STRING_LITERAL'])
        self.assertTokensTypes('L"ing"', ['WSTRING_LITERAL'])
        self.assertTokensTypes(
            '"i am a string too \t"',
            ['STRING_LITERAL'])
        self.assertTokensTypes(
            r'''"esc\ape \"\'\? \0234 chars \rule"''',
            ['STRING_LITERAL'])
        self.assertTokensTypes(
            r'''"hello 'joe' wanna give it a \"go\"?"''',
            ['STRING_LITERAL'])
        self.assertTokensTypes(
            '"\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123"',
            ['STRING_LITERAL'])
        # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
        # directives with Windows paths as filenames (..\..\dir\file)
        self.assertTokensTypes(
            r'"\x"',
            ['STRING_LITERAL'])
        self.assertTokensTypes(
            r'"\a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z\A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z"',
            ['STRING_LITERAL'])
        self.assertTokensTypes(
            r'"C:\x\fa\x1e\xited"',
            ['STRING_LITERAL'])
        # The lexer is permissive and allows decimal escapes (not just octal)
        self.assertTokensTypes(
            '"jx\9"',
            ['STRING_LITERAL'])
        self.assertTokensTypes(
            '"fo\9999999"',
            ['STRING_LITERAL'])

    def test_mess(self):
        self.assertTokensTypes(
            r'[{}]()',
            ['LBRACKET',
             'LBRACE', 'RBRACE',
             'RBRACKET',
             'LPAREN', 'RPAREN'])

        self.assertTokensTypes(
            r'()||!C&~Z?J',
            ['LPAREN', 'RPAREN',
             'LOR',
             'LNOT', 'ID',
             'AND',
             'NOT', 'ID',
             'CONDOP', 'ID'])

        self.assertTokensTypes(
            r'+-*/%|||&&&^><>=<===!=',
            ['PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
             'LOR', 'OR',
             'LAND', 'AND',
             'XOR',
             'GT', 'LT', 'GE', 'LE', 'EQ', 'NE'])

        self.assertTokensTypes(
            r'++--->?.,;:',
            ['PLUSPLUS', 'MINUSMINUS',
             'ARROW', 'CONDOP',
             'PERIOD', 'COMMA', 'SEMI', 'COLON'])

    def test_exprs(self):
        self.assertTokensTypes(
            'bb-cc',
            ['ID', 'MINUS', 'ID'])

        self.assertTokensTypes(
            'foo & 0xFF',
            ['ID', 'AND', 'INT_CONST_HEX'])

        self.assertTokensTypes(
            '(2+k) * 62',
            ['LPAREN', 'INT_CONST_DEC', 'PLUS', 'ID',
             'RPAREN', 'TIMES', 'INT_CONST_DEC'],)

        self.assertTokensTypes(
            'x | y >> z',
            ['ID', 'OR', 'ID', 'RSHIFT', 'ID'])

        self.assertTokensTypes(
            'x <<= z << 5',
            ['ID', 'LSHIFTEQUAL', 'ID', 'LSHIFT', 'INT_CONST_DEC'])

        self.assertTokensTypes(
            'x = y > 0 ? y : -6',
            ['ID', 'EQUALS',
             'ID', 'GT', 'INT_CONST_OCT',
             'CONDOP',
             'ID',
             'COLON',
             'MINUS', 'INT_CONST_DEC'])

        self.assertTokensTypes(
            'a+++b',
            ['ID', 'PLUSPLUS', 'PLUS', 'ID'])

    def test_statements(self):
        self.assertTokensTypes(
            'for (int i = 0; i < n; ++i)',
            ['FOR', 'LPAREN',
             'INT', 'ID', 'EQUALS', 'INT_CONST_OCT', 'SEMI',
             'ID', 'LT', 'ID', 'SEMI',
             'PLUSPLUS', 'ID',
             'RPAREN'])

        self.assertTokensTypes(
            'self: goto self;',
            ['ID', 'COLON', 'GOTO', 'ID', 'SEMI'])

        self.assertTokensTypes(
            """ switch (typ)
                {
                    case TYPE_ID:
                        m = 5;
                        break;
                    default:
                        m = 8;
                }""",
            ['SWITCH', 'LPAREN', 'ID', 'RPAREN',
             'LBRACE',
             'CASE', 'ID', 'COLON',
             'ID', 'EQUALS', 'INT_CONST_DEC', 'SEMI',
             'BREAK', 'SEMI',
             'DEFAULT', 'COLON',
             'ID', 'EQUALS', 'INT_CONST_DEC', 'SEMI',
             'RBRACE'])

    def test_preprocessor_line(self):
        self.assertTokensTypes('#abracadabra', ['PPHASH', 'ID'])

        # Line structure matters here: #line directives rewrite the lexer's
        # notion of the current line number and filename.
        text = r"""
        546
        #line 66 "kwas\df.h"
        id 4
        dsf
        # 9
        armo
        #line 10 "..\~..\test.h"
        tok1
        #line 99999 "include/me.h"
        tok2
        """

        self.clex.input(text)
        self.clex.reset_lineno()

        t1 = self.clex.token()
        self.assertEqual(t1.type, 'INT_CONST_DEC')
        self.assertEqual(t1.lineno, 2)

        t2 = self.clex.token()
        self.assertEqual(t2.type, 'ID')
        self.assertEqual(t2.value, 'id')
        self.assertEqual(t2.lineno, 66)
        self.assertEqual(self.clex.filename, r'kwas\df.h')

        # Skip '4', 'dsf' and the '# 9' directive to reach 'armo'.
        for _ in range(3):
            t = self.clex.token()

        self.assertEqual(t.type, 'ID')
        self.assertEqual(t.value, 'armo')
        self.assertEqual(t.lineno, 9)
        self.assertEqual(self.clex.filename, r'kwas\df.h')

        t4 = self.clex.token()
        self.assertEqual(t4.type, 'ID')
        self.assertEqual(t4.value, 'tok1')
        self.assertEqual(t4.lineno, 10)
        self.assertEqual(self.clex.filename, r'..\~..\test.h')

        t5 = self.clex.token()
        self.assertEqual(t5.type, 'ID')
        self.assertEqual(t5.value, 'tok2')
        self.assertEqual(t5.lineno, 99999)
        self.assertEqual(self.clex.filename, r'include/me.h')

    def test_preprocessor_line_funny(self):
        text = r'''
        #line 10 "..\6\joe.h"
        10
        '''
        self.clex.input(text)
        self.clex.reset_lineno()

        t1 = self.clex.token()
        self.assertEqual(t1.type, 'INT_CONST_DEC')
        self.assertEqual(t1.lineno, 10)
        self.assertEqual(self.clex.filename, r'..\6\joe.h')

    def test_preprocessor_pragma(self):
        # Non-raw string: '#\tpragma' deliberately contains a TAB between
        # the hash and the keyword.
        text = '''
        42
        #pragma
        #pragma helo me
        #pragma once
        # pragma omp parallel private(th_id)
        #\tpragma {pack: 2, smack: 3}
        #pragma <includeme.h> "nowit.h"
        #pragma "string"
        #pragma somestring="some_other_string"
        #pragma id 124124 and numbers 0235495
        59
        '''
        # Check that pragmas are tokenized, including trailing string
        self.clex.input(text)
        self.clex.reset_lineno()

        t1 = self.clex.token()
        self.assertEqual(t1.type, 'INT_CONST_DEC')

        t2 = self.clex.token()
        self.assertEqual(t2.type, 'PPPRAGMA')

        t3 = self.clex.token()
        self.assertEqual(t3.type, 'PPPRAGMA')

        t4 = self.clex.token()
        self.assertEqual(t4.type, 'PPPRAGMASTR')
        self.assertEqual(t4.value, 'helo me')

        # Skip over '#pragma once' and the start of the '# pragma omp' line.
        for _ in range(3):
            t = self.clex.token()

        t5 = self.clex.token()
        self.assertEqual(t5.type, 'PPPRAGMASTR')
        self.assertEqual(t5.value, 'omp parallel private(th_id)')

        # The remaining five pragmas each produce PPPRAGMA + PPPRAGMASTR.
        for _ in range(5):
            ta = self.clex.token()
            self.assertEqual(ta.type, 'PPPRAGMA')
            tb = self.clex.token()
            self.assertEqual(tb.type, 'PPPRAGMASTR')

        t6 = self.clex.token()
        self.assertEqual(t6.type, 'INT_CONST_DEC')
        self.assertEqual(t6.lineno, 12)


# Keeps all the errors the lexer spits in one place, to allow
# easier modification if the error syntax changes.
#
ERR_ILLEGAL_CHAR = 'Illegal character'
ERR_OCTAL = 'Invalid octal constant'
ERR_UNMATCHED_QUOTE = 'Unmatched \''
ERR_INVALID_CCONST = 'Invalid char constant'
ERR_STRING_ESCAPE = 'String contains invalid escape'

ERR_FILENAME_BEFORE_LINE = 'filename before line'
ERR_LINENUM_MISSING = 'line number missing'
ERR_INVALID_LINE_DIRECTIVE = 'invalid #line directive'


class TestCLexerErrors(unittest.TestCase):
    """ Test lexing of erroneous strings.
        Works by passing an error functions that saves the error
        in an attribute for later perusal.
    """
    def error_func(self, msg, line, column):
        self.error = msg

    def on_lbrace_func(self):
        pass

    def on_rbrace_func(self):
        pass

    def type_lookup_func(self, typ):
        return False

    def setUp(self):
        self.clex = CLexer(self.error_func, self.on_lbrace_func,
                           self.on_rbrace_func, self.type_lookup_func)
        self.clex.build(optimize=False)
        self.error = ""

    def assertLexerError(self, code, error_like):
        """Lex `code` and assert the recorded error matches regex `error_like`."""
        # feed the string to the lexer
        self.clex.input(code)

        # Pulls all tokens from the string. Errors will
        # be written into self.error by the error_func
        # callback
        #
        token_types(self.clex)

        # compare the error to the expected
        self.assertTrue(re.search(error_like, self.error),
                        "\nExpected error matching: %s\nGot: %s" %
                        (error_like, self.error))

        # clear last error, for the sake of subsequent invocations
        self.error = ""

    def test_trivial_tokens(self):
        self.assertLexerError('@', ERR_ILLEGAL_CHAR)
        self.assertLexerError('`', ERR_ILLEGAL_CHAR)
        self.assertLexerError('\\', ERR_ILLEGAL_CHAR)

    def test_integer_constants(self):
        self.assertLexerError('029', ERR_OCTAL)
        self.assertLexerError('012345678', ERR_OCTAL)

    def test_char_constants(self):
        self.assertLexerError("'", ERR_UNMATCHED_QUOTE)
        self.assertLexerError("'b\n", ERR_UNMATCHED_QUOTE)
        self.assertLexerError("'\\xaa\n'", ERR_UNMATCHED_QUOTE)

        self.assertLexerError(r"'123\12a'", ERR_INVALID_CCONST)
        self.assertLexerError(r"'123\xabg'", ERR_INVALID_CCONST)
        self.assertLexerError("''", ERR_INVALID_CCONST)
        self.assertLexerError("'abcjx'", ERR_INVALID_CCONST)
        self.assertLexerError(r"'\*'", ERR_INVALID_CCONST)

    def test_string_literals(self):
        self.assertLexerError(r'"jx\`"', ERR_STRING_ESCAPE)
        self.assertLexerError(r'"hekllo\* on ix"', ERR_STRING_ESCAPE)
        self.assertLexerError(r'L"hekllo\* on ix"', ERR_STRING_ESCAPE)
        # Should not suffer from slow backtracking
        self.assertLexerError(r'"\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\`\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123"', ERR_STRING_ESCAPE)
        self.assertLexerError(r'"\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\x23\`\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23"', ERR_STRING_ESCAPE)
        # Should not suffer from slow backtracking when there's no end quote
        self.assertLexerError(r'"\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\`\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\12\123456', ERR_ILLEGAL_CHAR)
        self.assertLexerError(r'"\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\`\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x2\x23456', ERR_ILLEGAL_CHAR)

    def test_preprocessor(self):
        self.assertLexerError('#line "ka"', ERR_FILENAME_BEFORE_LINE)
        self.assertLexerError('#line df', ERR_INVALID_LINE_DIRECTIVE)
        self.assertLexerError('#line \n', ERR_LINENUM_MISSING)


if __name__ == '__main__':
    unittest.main()