1# a glorified C pre-processor parser 2 3import sys, re, string 4from utils import * 5from defaults import * 6 7debugTokens = False 8debugDirectiveTokenizer = False 9debugLineParsing = False 10debugCppExpr = False 11debugOptimIf01 = False 12 13##################################################################################### 14##################################################################################### 15##### ##### 16##### C P P T O K E N S ##### 17##### ##### 18##################################################################################### 19##################################################################################### 20 21# the list of supported C-preprocessor tokens 22# plus a couple of C tokens as well 23tokEOF = "\0" 24tokLN = "\n" 25tokSTRINGIFY = "#" 26tokCONCAT = "##" 27tokLOGICAND = "&&" 28tokLOGICOR = "||" 29tokSHL = "<<" 30tokSHR = ">>" 31tokEQUAL = "==" 32tokNEQUAL = "!=" 33tokLT = "<" 34tokLTE = "<=" 35tokGT = ">" 36tokGTE = ">=" 37tokELLIPSIS = "..." 38tokSPACE = " " 39tokDEFINED = "defined" 40tokLPAREN = "(" 41tokRPAREN = ")" 42tokNOT = "!" 43tokPLUS = "+" 44tokMINUS = "-" 45tokMULTIPLY = "*" 46tokDIVIDE = "/" 47tokMODULUS = "%" 48tokBINAND = "&" 49tokBINOR = "|" 50tokBINXOR = "^" 51tokCOMMA = "," 52tokLBRACE = "{" 53tokRBRACE = "}" 54tokARROW = "->" 55tokINCREMENT = "++" 56tokDECREMENT = "--" 57tokNUMBER = "<number>" 58tokIDENT = "<ident>" 59tokSTRING = "<string>" 60 61class Token: 62 """a simple class to hold information about a given token. 63 each token has a position in the source code, as well as 64 an 'id' and a 'value'. the id is a string that identifies 65 the token's class, while the value is the string of the 66 original token itself. 67 68 for example, the tokenizer concatenates a series of spaces 69 and tabs as a single tokSPACE id, whose value if the original 70 spaces+tabs sequence.""" 71 72 def __init__(self): 73 self.id = None 74 self.value = None 75 self.lineno = 0 76 self.colno = 0 77 78 def set(self,id,val=None): 79 self.id = id 80 if val: 81 self.value = val 82 else: 83 self.value = id 84 return None 85 86 def copyFrom(self,src): 87 self.id = src.id 88 self.value = src.value 89 self.lineno = src.lineno 90 self.colno = src.colno 91 92 def __repr__(self): 93 if self.id == tokIDENT: 94 return "(ident %s)" % self.value 95 if self.id == tokNUMBER: 96 return "(number %s)" % self.value 97 if self.id == tokSTRING: 98 return "(string '%s')" % self.value 99 if self.id == tokLN: 100 return "<LN>" 101 if self.id == tokEOF: 102 return "<EOF>" 103 if self.id == tokSPACE and self.value == "\\": 104 # this corresponds to a trailing \ that was transformed into a tokSPACE 105 return "<\\>" 106 107 return self.id 108 109 def __str__(self): 110 if self.id == tokIDENT: 111 return self.value 112 if self.id == tokNUMBER: 113 return self.value 114 if self.id == tokSTRING: 115 return self.value 116 if self.id == tokEOF: 117 return "<EOF>" 118 if self.id == tokSPACE: 119 if self.value == "\\": # trailing \ 120 return "\\\n" 121 else: 122 return self.value 123 124 return self.id 125 126class BadExpectedToken(Exception): 127 def __init__(self,msg): 128 print msg 129 130##################################################################################### 131##################################################################################### 132##### ##### 133##### C P P T O K E N C U R S O R ##### 134##### ##### 135##################################################################################### 136##################################################################################### 137 138class TokenCursor: 139 """a small class to iterate over a list of Token objects""" 140 def __init__(self,tokens): 141 self.tokens = tokens 142 self.n = 0 143 self.count = len(tokens) 144 145 def set(self,n): 146 """set the current position""" 147 if n < 0: 148 n = 0 149 if n > self.count: 150 n = self.count 151 self.n = n 152 153 def peekId(self): 154 """retrieve the id of the current token""" 155 if (self.n >= self.count): 156 return None 157 return self.tokens[self.n].id 158 159 def peek(self): 160 """retrieve the current token. does not change position""" 161 if (self.n >= self.count): 162 return None 163 return self.tokens[self.n] 164 165 def skip(self): 166 """increase current token position""" 167 if (self.n < self.count): 168 self.n += 1 169 170 def skipSpaces(self): 171 """skip over all space tokens, this includes tokSPACE and tokLN""" 172 while 1: 173 tok = self.peekId() 174 if tok != tokSPACE and tok != tokLN: 175 break 176 self.skip() 177 178 def skipIfId(self,id): 179 """skip an optional token""" 180 if self.peekId() == id: 181 self.skip() 182 183 def expectId(self,id): 184 """raise an exception if the current token hasn't a given id. 185 otherwise skip over it""" 186 tok = self.peek() 187 if tok.id != id: 188 raise BadExpectedToken, "%d:%d: '%s' expected, received '%s'" % (tok.lineno, tok.colno, id, tok.id) 189 self.skip() 190 191 def remain(self): 192 """return the list of remaining tokens""" 193 return self.tokens[self.n:] 194 195 196##################################################################################### 197##################################################################################### 198##### ##### 199##### C P P T O K E N I Z E R ##### 200##### ##### 201##################################################################################### 202##################################################################################### 203 204# list of long symbols, i.e. those that take more than one characters 205cppLongSymbols = [ tokCONCAT, tokLOGICAND, tokLOGICOR, tokSHL, tokSHR, tokELLIPSIS, tokEQUAL,\ 206 tokNEQUAL, tokLTE, tokGTE, tokARROW, tokINCREMENT, tokDECREMENT ] 207 208class CppTokenizer: 209 """an abstract class used to convert some input text into a list 210 of tokens. real implementations follow and differ in the format 211 of the input text only""" 212 213 def __init__(self): 214 """initialize a new CppTokenizer object""" 215 self.eof = False # end of file reached ? 216 self.text = None # content of current line, with final \n stripped 217 self.line = 0 # number of current line 218 self.pos = 0 # current character position in current line 219 self.len = 0 # length of current line text 220 self.held = Token() 221 222 def setLineText(self,line): 223 """set the content of the (next) current line. should be called 224 by fillLineText() in derived classes""" 225 self.text = line 226 self.len = len(line) 227 self.pos = 0 228 229 def fillLineText(self): 230 """refresh the content of 'line' with a new line of input""" 231 # to be overriden 232 self.eof = True 233 234 def markPos(self,tok): 235 """mark the position of the current token in the source file""" 236 if self.eof or self.pos > self.len: 237 tok.lineno = self.line + 1 238 tok.colno = 0 239 else: 240 tok.lineno = self.line 241 tok.colno = self.pos 242 243 def peekChar(self): 244 """return the current token under the cursor without moving it""" 245 if self.eof: 246 return tokEOF 247 248 if self.pos > self.len: 249 self.pos = 0 250 self.line += 1 251 self.fillLineText() 252 if self.eof: 253 return tokEOF 254 255 if self.pos == self.len: 256 return tokLN 257 else: 258 return self.text[self.pos] 259 260 def peekNChar(self,n): 261 """try to peek the next n chars on the same line""" 262 if self.pos + n > self.len: 263 return None 264 return self.text[self.pos:self.pos+n] 265 266 def skipChar(self): 267 """increment the token cursor position""" 268 if not self.eof: 269 self.pos += 1 270 271 def skipNChars(self,n): 272 if self.pos + n <= self.len: 273 self.pos += n 274 else: 275 while n > 0: 276 self.skipChar() 277 n -= 1 278 279 def nextChar(self): 280 """retrieve the token at the current cursor position, then skip it""" 281 result = self.peekChar() 282 self.skipChar() 283 return result 284 285 def getEscape(self): 286 # try to get all characters after a backslash (\) 287 result = self.nextChar() 288 if result == "0": 289 # octal number ? 290 num = self.peekNChar(3) 291 if num != None: 292 isOctal = True 293 for d in num: 294 if not d in "01234567": 295 isOctal = False 296 break 297 if isOctal: 298 result += num 299 self.skipNChars(3) 300 elif result == "x" or result == "X": 301 # hex number ? 302 num = self.peekNChar(2) 303 if num != None: 304 isHex = True 305 for d in num: 306 if not d in "012345678abcdefABCDEF": 307 isHex = False 308 break 309 if isHex: 310 result += num 311 self.skipNChars(2) 312 elif result == "u" or result == "U": 313 # unicode char ? 314 num = self.peekNChar(4) 315 if num != None: 316 isHex = True 317 for d in num: 318 if not d in "012345678abcdefABCDEF": 319 isHex = False 320 break 321 if isHex: 322 result += num 323 self.skipNChars(4) 324 325 return result 326 327 def nextRealToken(self,tok): 328 """return next CPP token, used internally by nextToken()""" 329 c = self.nextChar() 330 if c == tokEOF or c == tokLN: 331 return tok.set(c) 332 333 if c == '/': 334 c = self.peekChar() 335 if c == '/': # C++ comment line 336 self.skipChar() 337 while 1: 338 c = self.nextChar() 339 if c == tokEOF or c == tokLN: 340 break 341 return tok.set(tokLN) 342 if c == '*': # C comment start 343 self.skipChar() 344 value = "/*" 345 prev_c = None 346 while 1: 347 c = self.nextChar() 348 if c == tokEOF: 349 #print "## EOF after '%s'" % value 350 return tok.set(tokEOF,value) 351 if c == '/' and prev_c == '*': 352 break 353 prev_c = c 354 value += c 355 356 value += "/" 357 #print "## COMMENT: '%s'" % value 358 return tok.set(tokSPACE,value) 359 c = '/' 360 361 if c.isspace(): 362 while 1: 363 c2 = self.peekChar() 364 if c2 == tokLN or not c2.isspace(): 365 break 366 c += c2 367 self.skipChar() 368 return tok.set(tokSPACE,c) 369 370 if c == '\\': 371 if debugTokens: 372 print "nextRealToken: \\ found, next token is '%s'" % repr(self.peekChar()) 373 if self.peekChar() == tokLN: # trailing \ 374 # eat the tokLN 375 self.skipChar() 376 # we replace a trailing \ by a tokSPACE whose value is 377 # simply "\\". this allows us to detect them later when 378 # needed. 379 return tok.set(tokSPACE,"\\") 380 else: 381 # treat as a single token here ? 382 c +=self.getEscape() 383 return tok.set(c) 384 385 if c == "'": # chars 386 c2 = self.nextChar() 387 c += c2 388 if c2 == '\\': 389 c += self.getEscape() 390 391 while 1: 392 c2 = self.nextChar() 393 if c2 == tokEOF: 394 break 395 c += c2 396 if c2 == "'": 397 break 398 399 return tok.set(tokSTRING, c) 400 401 if c == '"': # strings 402 quote = 0 403 while 1: 404 c2 = self.nextChar() 405 if c2 == tokEOF: 406 return tok.set(tokSTRING,c) 407 408 c += c2 409 if not quote: 410 if c2 == '"': 411 return tok.set(tokSTRING,c) 412 if c2 == "\\": 413 quote = 1 414 else: 415 quote = 0 416 417 if c >= "0" and c <= "9": # integers ? 418 while 1: 419 c2 = self.peekChar() 420 if c2 == tokLN or (not c2.isalnum() and c2 != "_"): 421 break 422 c += c2 423 self.skipChar() 424 return tok.set(tokNUMBER,c) 425 426 if c.isalnum() or c == "_": # identifiers ? 427 while 1: 428 c2 = self.peekChar() 429 if c2 == tokLN or (not c2.isalnum() and c2 != "_"): 430 break 431 c += c2 432 self.skipChar() 433 if c == tokDEFINED: 434 return tok.set(tokDEFINED) 435 else: 436 return tok.set(tokIDENT,c) 437 438 # check special symbols 439 for sk in cppLongSymbols: 440 if c == sk[0]: 441 sklen = len(sk[1:]) 442 if self.pos + sklen <= self.len and \ 443 self.text[self.pos:self.pos+sklen] == sk[1:]: 444 self.pos += sklen 445 return tok.set(sk) 446 447 return tok.set(c) 448 449 def nextToken(self,tok): 450 """return the next token from the input text. this function 451 really updates 'tok', and does not return a new one""" 452 self.markPos(tok) 453 self.nextRealToken(tok) 454 455 def getToken(self): 456 tok = Token() 457 self.nextToken(tok) 458 if debugTokens: 459 print "getTokens: %s" % repr(tok) 460 return tok 461 462 def toTokenList(self): 463 """convert the input text of a CppTokenizer into a direct 464 list of token objects. tokEOF is stripped from the result""" 465 result = [] 466 while 1: 467 tok = Token() 468 self.nextToken(tok) 469 if tok.id == tokEOF: 470 break 471 result.append(tok) 472 return result 473 474class CppLineTokenizer(CppTokenizer): 475 """a CppTokenizer derived class that accepts a single line of text as input""" 476 def __init__(self,line,lineno=1): 477 CppTokenizer.__init__(self) 478 self.line = lineno 479 self.setLineText(line) 480 481 482class CppLinesTokenizer(CppTokenizer): 483 """a CppTokenizer derived class that accepts a list of texdt lines as input. 484 the lines must not have a trailing \n""" 485 def __init__(self,lines=[],lineno=1): 486 """initialize a CppLinesTokenizer. you can later add lines using addLines()""" 487 CppTokenizer.__init__(self) 488 self.line = lineno 489 self.lines = lines 490 self.index = 0 491 self.count = len(lines) 492 493 if self.count > 0: 494 self.fillLineText() 495 else: 496 self.eof = True 497 498 def addLine(self,line): 499 """add a line to a CppLinesTokenizer. this can be done after tokenization 500 happens""" 501 if self.count == 0: 502 self.setLineText(line) 503 self.index = 1 504 self.lines.append(line) 505 self.count += 1 506 self.eof = False 507 508 def fillLineText(self): 509 if self.index < self.count: 510 self.setLineText(self.lines[self.index]) 511 self.index += 1 512 else: 513 self.eof = True 514 515 516class CppFileTokenizer(CppTokenizer): 517 def __init__(self,file,lineno=1): 518 CppTokenizer.__init__(self) 519 self.file = file 520 self.line = lineno 521 522 def fillLineText(self): 523 line = self.file.readline() 524 if len(line) > 0: 525 if line[-1] == '\n': 526 line = line[:-1] 527 if len(line) > 0 and line[-1] == "\r": 528 line = line[:-1] 529 self.setLineText(line) 530 else: 531 self.eof = True 532 533# Unit testing 534# 535class CppTokenizerTester: 536 """a class used to test CppTokenizer classes""" 537 def __init__(self,tokenizer=None): 538 self.tokenizer = tokenizer 539 self.token = Token() 540 541 def setTokenizer(self,tokenizer): 542 self.tokenizer = tokenizer 543 544 def expect(self,id): 545 self.tokenizer.nextToken(self.token) 546 tokid = self.token.id 547 if tokid == id: 548 return 549 if self.token.value == id and (tokid == tokIDENT or tokid == tokNUMBER): 550 return 551 raise BadExpectedToken, "### BAD TOKEN: '%s' expecting '%s'" % (self.token.id,id) 552 553 def expectToken(self,id,line,col): 554 self.expect(id) 555 if self.token.lineno != line: 556 raise BadExpectedToken, "### BAD LINENO: token '%s' got '%d' expecting '%d'" % (id,self.token.lineno,line) 557 if self.token.colno != col: 558 raise BadExpectedToken, "### BAD COLNO: '%d' expecting '%d'" % (self.token.colno,col) 559 560 def expectTokenVal(self,id,value,line,col): 561 self.expectToken(id,line,col) 562 if self.token.value != value: 563 raise BadExpectedToken, "### BAD VALUE: '%s' expecting '%s'" % (self.token.value,value) 564 565 def expectList(self,list): 566 for item in list: 567 self.expect(item) 568 569def test_CppTokenizer(): 570 print "running CppTokenizer tests" 571 tester = CppTokenizerTester() 572 573 tester.setTokenizer( CppLineTokenizer("#an/example && (01923_xy)") ) 574 tester.expectList( ["#", "an", "/", "example", tokSPACE, tokLOGICAND, tokSPACE, tokLPAREN, "01923_xy", \ 575 tokRPAREN, tokLN, tokEOF] ) 576 577 tester.setTokenizer( CppLineTokenizer("FOO(BAR) && defined(BAZ)") ) 578 tester.expectList( ["FOO", tokLPAREN, "BAR", tokRPAREN, tokSPACE, tokLOGICAND, tokSPACE, 579 tokDEFINED, tokLPAREN, "BAZ", tokRPAREN, tokLN, tokEOF] ) 580 581 tester.setTokenizer( CppLinesTokenizer( ["/*", "#", "*/"] ) ) 582 tester.expectList( [ tokSPACE, tokLN, tokEOF ] ) 583 584 tester.setTokenizer( CppLinesTokenizer( ["first", "second"] ) ) 585 tester.expectList( [ "first", tokLN, "second", tokLN, tokEOF ] ) 586 587 tester.setTokenizer( CppLinesTokenizer( ["first second", " third"] ) ) 588 tester.expectToken( "first", 1, 0 ) 589 tester.expectToken( tokSPACE, 1, 5 ) 590 tester.expectToken( "second", 1, 6 ) 591 tester.expectToken( tokLN, 1, 12 ) 592 tester.expectToken( tokSPACE, 2, 0 ) 593 tester.expectToken( "third", 2, 2 ) 594 595 tester.setTokenizer( CppLinesTokenizer( [ "boo /* what the", "hell */" ] ) ) 596 tester.expectList( [ "boo", tokSPACE ] ) 597 tester.expectTokenVal( tokSPACE, "/* what the\nhell */", 1, 4 ) 598 tester.expectList( [ tokLN, tokEOF ] ) 599 600 tester.setTokenizer( CppLinesTokenizer( [ "an \\", " example" ] ) ) 601 tester.expectToken( "an", 1, 0 ) 602 tester.expectToken( tokSPACE, 1, 2 ) 603 tester.expectTokenVal( tokSPACE, "\\", 1, 3 ) 604 tester.expectToken( tokSPACE, 2, 0 ) 605 tester.expectToken( "example", 2, 1 ) 606 tester.expectToken( tokLN, 2, 8 ) 607 608 return True 609 610 611##################################################################################### 612##################################################################################### 613##### ##### 614##### C P P E X P R E S S I O N S ##### 615##### ##### 616##################################################################################### 617##################################################################################### 618 619# Cpp expressions are modeled by tuples of the form (op,arg) or (op,arg1,arg2), etc.. 620# op is an "operator" string 621 622class Expr: 623 """a class used to model a CPP expression""" 624 opInteger = "int" 625 opIdent = "ident" 626 opCall = "call" 627 opDefined = "defined" 628 opTest = "?" 629 opLogicNot = "!" 630 opNot = "~" 631 opNeg = "[-]" 632 opUnaryPlus = "[+]" 633 opAdd = "+" 634 opSub = "-" 635 opMul = "*" 636 opDiv = "/" 637 opMod = "%" 638 opAnd = "&" 639 opOr = "|" 640 opXor = "^" 641 opLogicAnd = "&&" 642 opLogicOr = "||" 643 opEqual = "==" 644 opNotEqual = "!=" 645 opLess = "<" 646 opLessEq = "<=" 647 opGreater = ">" 648 opGreaterEq = ">=" 649 opShl = "<<" 650 opShr = ">>" 651 652 unaries = [ opLogicNot, opNot, opNeg, opUnaryPlus ] 653 binaries = [ opAdd, opSub, opMul, opDiv, opMod, opAnd, opOr, opXor, opLogicAnd, opLogicOr, 654 opEqual, opNotEqual, opLess, opLessEq, opGreater, opGreaterEq ] 655 656 precedences = { 657 opTest: 0, 658 opLogicOr: 1, 659 opLogicNot: 2, 660 opOr : 3, 661 opXor: 4, 662 opAnd: 5, 663 opEqual: 6, opNotEqual: 6, 664 opLess:7, opLessEq:7, opGreater:7, opGreaterEq:7, 665 opShl:8, opShr:8, 666 opAdd:9, opSub:9, 667 opMul:10, opDiv:10, opMod:10, 668 opLogicNot:11, 669 opNot: 12, 670 } 671 672 def __init__(self,op): 673 self.op = op 674 675 def __repr__(self): 676 return "(%s)" % self.op 677 678 def __str__(self): 679 return "operator(%s)" % self.op 680 681 def precedence(self): 682 """return the precedence of a given operator""" 683 return Expr.precedences.get(self.op, 1000) 684 685 def isUnary(self): 686 return self.op in Expr.unaries 687 688 def isBinary(self): 689 return self.op in Expr.binaries 690 691 def isDefined(self): 692 return self.op is opDefined 693 694 def toInt(self): 695 """return the integer value of a given expression. only valid for integer expressions 696 will return None otherwise""" 697 return None 698 699class IntExpr(Expr): 700 def __init__(self,value): 701 Expr.__init__(self,opInteger) 702 self.arg = value 703 704 def __repr__(self): 705 return "(int %s)" % self.arg 706 707 def __str__(self): 708 return self.arg 709 710 def toInt(self): 711 s = self.arg # string value 712 # get rid of U or L suffixes 713 while len(s) > 0 and s[-1] in "LUlu": 714 s = s[:-1] 715 return string.atoi(s) 716 717class IdentExpr(Expr): 718 def __init__(self,name): 719 Expr.__init__(self,opIdent) 720 self.name = name 721 722 def __repr__(self): 723 return "(ident %s)" % self.name 724 725 def __str__(self): 726 return self.name 727 728class CallExpr(Expr): 729 def __init__(self,funcname,params): 730 Expr.__init__(self,opCall) 731 self.funcname = funcname 732 self.params = params 733 734 def __repr__(self): 735 result = "(call %s [" % self.funcname 736 comma = "" 737 for param in self.params: 738 result += "%s%s" % (comma, repr(param)) 739 comma = "," 740 result += "])" 741 return result 742 743 def __str__(self): 744 result = "%s(" % self.funcname 745 comma = "" 746 for param in self.params: 747 result += "%s%s" % (comma, str(param)) 748 comma = "," 749 750 result += ")" 751 return result 752 753class TestExpr(Expr): 754 def __init__(self,cond,iftrue,iffalse): 755 Expr.__init__(self,opTest) 756 self.cond = cond 757 self.iftrue = iftrue 758 self.iffalse = iffalse 759 760 def __repr__(self): 761 return "(?: %s %s %s)" % (repr(self.cond),repr(self.iftrue),repr(self.iffalse)) 762 763 def __str__(self): 764 return "(%s) ? (%s) : (%s)" % (self.cond, self.iftrue, self.iffalse) 765 766class SingleArgExpr(Expr): 767 def __init__(self,op,arg): 768 Expr.__init__(self,op) 769 self.arg = arg 770 771 def __repr__(self): 772 return "(%s %s)" % (self.op, repr(self.arg)) 773 774class DefinedExpr(SingleArgExpr): 775 def __init__(self,op,macroname): 776 SingleArgExpr.__init__(self.opDefined,macroname) 777 778 def __str__(self): 779 return "defined(%s)" % self.arg 780 781 782class UnaryExpr(SingleArgExpr): 783 def __init__(self,op,arg,opstr=None): 784 SingleArgExpr.__init__(self,op,arg) 785 if not opstr: 786 opstr = op 787 self.opstr = opstr 788 789 def __str__(self): 790 arg_s = str(self.arg) 791 arg_prec = self.arg.precedence() 792 self_prec = self.precedence() 793 if arg_prec < self_prec: 794 return "%s(%s)" % (self.opstr,arg_s) 795 else: 796 return "%s%s" % (self.opstr, arg_s) 797 798class TwoArgExpr(Expr): 799 def __init__(self,op,arg1,arg2): 800 Expr.__init__(self,op) 801 self.arg1 = arg1 802 self.arg2 = arg2 803 804 def __repr__(self): 805 return "(%s %s %s)" % (self.op, repr(self.arg1), repr(self.arg2)) 806 807class BinaryExpr(TwoArgExpr): 808 def __init__(self,op,arg1,arg2,opstr=None): 809 TwoArgExpr.__init__(self,op,arg1,arg2) 810 if not opstr: 811 opstr = op 812 self.opstr = opstr 813 814 def __str__(self): 815 arg1_s = str(self.arg1) 816 arg2_s = str(self.arg2) 817 arg1_prec = self.arg1.precedence() 818 arg2_prec = self.arg2.precedence() 819 self_prec = self.precedence() 820 821 result = "" 822 if arg1_prec < self_prec: 823 result += "(%s)" % arg1_s 824 else: 825 result += arg1_s 826 827 result += " %s " % self.opstr 828 829 if arg2_prec < self_prec: 830 result += "(%s)" % arg2_s 831 else: 832 result += arg2_s 833 834 return result 835 836##################################################################################### 837##################################################################################### 838##### ##### 839##### C P P E X P R E S S I O N P A R S E R ##### 840##### ##### 841##################################################################################### 842##################################################################################### 843 844 845class ExprParser: 846 """a class used to convert a list of tokens into a cpp Expr object""" 847 848 re_octal = re.compile(r"\s*\(0[0-7]+\).*") 849 re_decimal = re.compile(r"\s*\(\d+[ulUL]*\).*") 850 re_hexadecimal = re.compile(r"\s*\(0[xX][0-9a-fA-F]*\).*") 851 852 def __init__(self,tokens): 853 self.tok = tokens 854 self.n = len(self.tok) 855 self.i = 0 856 857 def mark(self): 858 return self.i 859 860 def release(self,pos): 861 self.i = pos 862 863 def peekId(self): 864 if self.i < self.n: 865 return self.tok[self.i].id 866 return None 867 868 def peek(self): 869 if self.i < self.n: 870 return self.tok[self.i] 871 return None 872 873 def skip(self): 874 if self.i < self.n: 875 self.i += 1 876 877 def skipOptional(self,id): 878 if self.i < self.n and self.tok[self.i].id == id: 879 self.i += 1 880 881 def skipSpaces(self): 882 i = self.i 883 n = self.n 884 tok = self.tok 885 while i < n and (tok[i] == tokSPACE or tok[i] == tokLN): 886 i += 1 887 self.i = i 888 889 # all the isXXX functions returns a (expr,nextpos) pair if a match is found 890 # or None if not 891 892 def is_integer(self): 893 id = self.tok[self.i].id 894 c = id[0] 895 if c < '0' or c > '9': 896 return None 897 898 m = ExprParser.re_octal.match(id) 899 if m: 900 return (IntExpr(id), m.end(1)) 901 902 m = ExprParser.re_decimal.match(id) 903 if m: 904 return (IntExpr(id), m.end(1)) 905 906 m = ExprParser.re_hexadecimal(id) 907 if m: 908 return (IntExpr(id), m.end(1)) 909 910 return None 911 912 def is_defined(self): 913 id = self.tok[self.i].id 914 if id != "defined": 915 return None 916 917 pos = self.mark() 918 919 use_paren = 0 920 if self.peekId() == tokLPAREN: 921 self.skip() 922 use_paren = 1 923 924 if self.peekId() != tokIDENT: 925 self.throw( BadExpectedToken, "identifier expected") 926 927 macroname = self.peek().value 928 self.skip() 929 if use_paren: 930 self.skipSpaces() 931 if self.peekId() != tokRPAREN: 932 self.throw( BadExpectedToken, "missing right-paren after 'defined' directive") 933 self.skip() 934 935 i = self.i 936 return (DefinedExpr(macroname),i+1) 937 938 def is_call_or_ident(self): 939 pass 940 941 def parse(self, i): 942 return None 943 944##################################################################################### 945##################################################################################### 946##### ##### 947##### C P P E X P R E S S I O N S ##### 948##### ##### 949##################################################################################### 950##################################################################################### 951 952class CppInvalidExpression(Exception): 953 """an exception raised when an invalid/unsupported cpp expression is detected""" 954 pass 955 956class CppExpr: 957 """a class that models the condition of #if directives into 958 an expression tree. each node in the tree is of the form (op,arg) or (op,arg1,arg2) 959 where "op" is a string describing the operation""" 960 961 unaries = [ "!", "~" ] 962 binaries = [ "+", "-", "<", "<=", ">=", ">", "&&", "||", "*", "/", "%", "&", "|", "^", "<<", ">>", "==", "!=" ] 963 precedences = { "||": 1, 964 "&&": 2, 965 "|": 3, 966 "^": 4, 967 "&": 5, 968 "==":6, "!=":6, 969 "<":7, "<=":7, ">":7, ">=":7, 970 "<<":8, ">>":8, 971 "+":9, "-":9, 972 "*":10, "/":10, "%":10, 973 "!":11, "~":12 974 } 975 976 def __init__(self, tokens): 977 """initialize a CppExpr. 'tokens' must be a CppToken list""" 978 self.tok = tokens 979 self.n = len(tokens) 980 if debugCppExpr: 981 print "CppExpr: trying to parse %s" % repr(tokens) 982 expr = self.is_expr(0) 983 if debugCppExpr: 984 print "CppExpr: got " + repr(expr) 985 self.expr = expr[0] 986 987 re_cpp_constant = re.compile(r"((\d|\w|_)+)") 988 989 def throw(self,exception,i,msg): 990 if i < self.n: 991 tok = self.tok[i] 992 print "%d:%d: %s" % (tok.lineno,tok.colno,msg) 993 else: 994 print "EOF: %s" % msg 995 raise exception 996 997 def skip_spaces(self,i): 998 """skip spaces in input token list""" 999 while i < self.n: 1000 t = self.tok[i] 1001 if t.id != tokSPACE and t.id != tokLN: 1002 break 1003 i += 1 1004 return i 1005 1006 def expectId(self,i,id): 1007 """check that a given token id is at the current position, then skip over it""" 1008 i = self.skip_spaces(i) 1009 if i >= self.n or self.tok[i].id != id: 1010 self.throw(BadExpectedToken,i,"### expecting '%s' in expression, got '%s'" % (id, self.tok[i].id)) 1011 return i+1 1012 1013 def expectIdent(self,i): 1014 i = self.skip_spaces(i) 1015 if i >= self.n or self.tok[i].id != tokIDENT: 1016 self.throw(BadExpectedToken,i,"### expecting identifier in expression, got '%s'" % (id, self.tok[i].id)) 1017 return i+1 1018 1019 # the is_xxxxx function returns either None or a pair (e,nextpos) 1020 # where 'e' is an expression tuple (e.g. (op,arg)) and 'nextpos' is 1021 # the corresponding next position in the input token list 1022 # 1023 1024 def is_decimal(self,i): 1025 v = self.tok[i].value[:] 1026 while len(v) > 0 and v[-1] in "ULul": 1027 v = v[:-1] 1028 for digit in v: 1029 if not digit.isdigit(): 1030 return None 1031 1032 # for an integer expression tuple, the argument 1033 # is simply the value as an integer 1034 val = string.atoi(v) 1035 return ("int", val), i+1 1036 1037 def is_hexadecimal(self,i): 1038 v = self.tok[i].value[:] 1039 while len(v) > 0 and v[-1] in "ULul": 1040 v = v[:-1] 1041 if len(v) > 2 and (v[0:2] == "0x" or v[0:2] == "0X"): 1042 for digit in v[2:]: 1043 if not digit in "0123456789abcdefABCDEF": 1044 return None 1045 1046 # for an hex expression tuple, the argument 1047 # is the value as an integer 1048 val = int(v[2:], 16) 1049 return ("hex", val), i+1 1050 1051 return None 1052 1053 def is_integer(self,i): 1054 if self.tok[i].id != tokNUMBER: 1055 return None 1056 1057 c = self.is_decimal(i) 1058 if c: return c 1059 1060 c = self.is_hexadecimal(i) 1061 if c: return c 1062 1063 return None 1064 1065 def is_number(self,i): 1066 t = self.tok[i] 1067 if t.id == tokMINUS and i+1 < self.n: 1068 c = self.is_integer(i+1) 1069 if c: 1070 e, i2 = c 1071 op, val = e 1072 return (op, -val), i2 1073 if t.id == tokPLUS and i+1 < self.n: 1074 c = self.is_integer(i+1) 1075 if c: return c 1076 1077 return self.is_integer(i) 1078 1079 1080 def is_alnum(self,i): 1081 """test wether a given token is alpha-numeric""" 1082 i = self.skip_spaces(i) 1083 if i >= self.n: 1084 return None 1085 t = self.tok[i] 1086 m = CppExpr.re_cpp_constant.match(t.id) 1087 if m: 1088 #print "... alnum '%s'" % m.group(1) 1089 r = m.group(1) 1090 return ("ident", r), i+1 1091 return None 1092 1093 def is_defined(self,i): 1094 t = self.tok[i] 1095 if t.id != tokDEFINED: 1096 return None 1097 1098 # we have the defined keyword, check the rest 1099 i = self.skip_spaces(i+1) 1100 use_parens = 0 1101 if i < self.n and self.tok[i].id == tokLPAREN: 1102 use_parens = 1 1103 i = self.skip_spaces(i+1) 1104 1105 if i >= self.n: 1106 self.throw(CppConstantExpected,i,"### 'defined' must be followed by macro name or left paren") 1107 1108 t = self.tok[i] 1109 if t.id != tokIDENT: 1110 self.throw(CppConstantExpected,i,"### 'defined' must be followed by macro name") 1111 1112 i += 1 1113 if use_parens: 1114 i = self.expectId(i,tokRPAREN) 1115 1116 return ("defined",t.value), i 1117 1118 1119 def is_call_or_ident(self,i): 1120 i = self.skip_spaces(i) 1121 if i >= self.n: 1122 return None 1123 1124 t = self.tok[i] 1125 if t.id != tokIDENT: 1126 return None 1127 1128 name = t.value 1129 1130 i = self.skip_spaces(i+1) 1131 if i >= self.n or self.tok[i].id != tokLPAREN: 1132 return ("ident", name), i 1133 1134 params = [] 1135 depth = 1 1136 i += 1 1137 j = i 1138 while i < self.n: 1139 id = self.tok[i].id 1140 if id == tokLPAREN: 1141 depth += 1 1142 elif depth == 1 and (id == tokCOMMA or id == tokRPAREN): 1143 while j < i and self.tok[j].id == tokSPACE: 1144 j += 1 1145 k = i 1146 while k > j and self.tok[k-1].id == tokSPACE: 1147 k -= 1 1148 param = self.tok[j:k] 1149 params.append( param ) 1150 if id == tokRPAREN: 1151 break 1152 j = i+1 1153 elif id == tokRPAREN: 1154 depth -= 1 1155 i += 1 1156 1157 if i >= self.n: 1158 return None 1159 1160 return ("call", (name, params)), i+1 1161 1162 def is_token(self,i,token): 1163 i = self.skip_spaces(i) 1164 if i >= self.n or self.tok[i].id != token: 1165 return None 1166 return token, i+1 1167 1168 1169 def is_value(self,i): 1170 t = self.tok[i] 1171 if t.id == tokSTRING: 1172 return ("string", t.value), i+1 1173 1174 c = self.is_number(i) 1175 if c: return c 1176 1177 c = self.is_defined(i) 1178 if c: return c 1179 1180 c = self.is_call_or_ident(i) 1181 if c: return c 1182 1183 i = self.skip_spaces(i) 1184 if i >= self.n or self.tok[i].id != tokLPAREN: 1185 return None 1186 1187 popcount = 1 1188 i2 = i+1 1189 while i2 < self.n: 1190 t = self.tok[i2] 1191 if t.id == tokLPAREN: 1192 popcount += 1 1193 elif t.id == tokRPAREN: 1194 popcount -= 1 1195 if popcount == 0: 1196 break 1197 i2 += 1 1198 1199 if popcount != 0: 1200 self.throw(CppInvalidExpression, i, "expression missing closing parenthesis") 1201 1202 if debugCppExpr: 1203 print "CppExpr: trying to parse sub-expression %s" % repr(self.tok[i+1:i2]) 1204 oldcount = self.n 1205 self.n = i2 1206 c = self.is_expr(i+1) 1207 self.n = oldcount 1208 if not c: 1209 self.throw(CppInvalidExpression, i, "invalid expression within parenthesis") 1210 1211 e, i = c 1212 return e, i2+1 1213 1214 def is_unary(self,i): 1215 i = self.skip_spaces(i) 1216 if i >= self.n: 1217 return None 1218 1219 t = self.tok[i] 1220 if t.id in CppExpr.unaries: 1221 c = self.is_unary(i+1) 1222 if not c: 1223 self.throw(CppInvalidExpression, i, "%s operator must be followed by value" % t.id) 1224 e, i = c 1225 return (t.id, e), i 1226 1227 return self.is_value(i) 1228 1229 def is_binary(self,i): 1230 i = self.skip_spaces(i) 1231 if i >= self.n: 1232 return None 1233 1234 c = self.is_unary(i) 1235 if not c: 1236 return None 1237 1238 e1, i2 = c 1239 i2 = self.skip_spaces(i2) 1240 if i2 >= self.n: 1241 return c 1242 1243 t = self.tok[i2] 1244 if t.id in CppExpr.binaries: 1245 c = self.is_binary(i2+1) 1246 if not c: 1247 self.throw(CppInvalidExpression, i,"### %s operator must be followed by value" % t.id ) 1248 e2, i3 = c 1249 return (t.id, e1, e2), i3 1250 1251 return None 1252 1253 def is_expr(self,i): 1254 return self.is_binary(i) 1255 1256 def dump_node(self,e): 1257 op = e[0] 1258 line = "(" + op 1259 if op == "int": 1260 line += " %d)" % e[1] 1261 elif op == "hex": 1262 line += " 0x%x)" % e[1] 1263 elif op == "ident": 1264 line += " %s)" % e[1] 1265 elif op == "defined": 1266 line += " %s)" % e[1] 1267 elif op == "call": 1268 arg = e[1] 1269 line += " %s [" % arg[0] 1270 prefix = "" 1271 for param in arg[1]: 1272 par = "" 1273 for tok in param: 1274 par += str(tok) 1275 line += "%s%s" % (prefix, par) 1276 prefix = "," 1277 line += "])" 1278 elif op in CppExpr.unaries: 1279 line += " %s)" % self.dump_node(e[1]) 1280 elif op in CppExpr.binaries: 1281 line += " %s %s)" % (self.dump_node(e[1]), self.dump_node(e[2])) 1282 else: 1283 line += " ?%s)" % repr(e[1]) 1284 1285 return line 1286 1287 def __repr__(self): 1288 return self.dump_node(self.expr) 1289 1290 def source_node(self,e): 1291 op = e[0] 1292 if op == "int": 1293 return "%d" % e[1] 1294 if op == "hex": 1295 return "0x%x" % e[1] 1296 if op == "ident": 1297 # XXX: should try to expand 1298 return e[1] 1299 if op == "defined": 1300 return "defined(%s)" % e[1] 1301 1302 prec = CppExpr.precedences.get(op,1000) 1303 arg = e[1] 1304 if op in CppExpr.unaries: 1305 arg_src = self.source_node(arg) 1306 arg_op = arg[0] 1307 arg_prec = CppExpr.precedences.get(arg[0],1000) 1308 if arg_prec < prec: 1309 return "!(" + arg_src + ")" 1310 else: 1311 return "!" + arg_src 1312 if op in CppExpr.binaries: 1313 arg2 = e[2] 1314 arg1_op = arg[0] 1315 arg2_op = arg2[0] 1316 arg1_src = self.source_node(arg) 1317 arg2_src = self.source_node(arg2) 1318 if CppExpr.precedences.get(arg1_op,1000) < prec: 1319 arg1_src = "(%s)" % arg1_src 1320 if CppExpr.precedences.get(arg2_op,1000) < prec: 1321 arg2_src = "(%s)" % arg2_src 1322 1323 return "%s %s %s" % (arg1_src, op, arg2_src) 1324 return "???" 1325 1326 def __str__(self): 1327 return self.source_node(self.expr) 1328 1329 def int_node(self,e): 1330 if e[0] == "int": 1331 return e[1] 1332 elif e[1] == "hex": 1333 return int(e[1],16) 1334 else: 1335 return None 1336 1337 def toInt(self): 1338 return self.int_node(self.expr) 1339 1340 def optimize_node(self,e,macros={}): 1341 op = e[0] 1342 if op == "defined": 1343 name = e[1] 1344 if macros.has_key(name): 1345 if macros[name] == kCppUndefinedMacro: 1346 return ("int", 0) 1347 else: 1348 return ("int", 1) 1349 1350 if kernel_remove_config_macros and name.startswith("CONFIG_"): 1351 return ("int", 0) 1352 1353 elif op == "!": 1354 op, v = e 1355 v = self.optimize_node(v, macros) 1356 if v[0] == "int": 1357 if v[1] == 0: 1358 return ("int", 1) 1359 else: 1360 return ("int", 0) 1361 1362 elif op == "&&": 1363 op, l, r = e 1364 l = self.optimize_node(l, macros) 1365 r = self.optimize_node(r, macros) 1366 li = self.int_node(l) 1367 ri = self.int_node(r) 1368 if li != None: 1369 if li == 0: 1370 return ("int", 0) 1371 else: 1372 return r 1373 1374 elif op == "||": 1375 op, l, r = e 1376 l = self.optimize_node(l, macros) 1377 r = self.optimize_node(r, macros) 1378 li = self.int_node(l) 1379 ri = self.int_node(r) 1380 if li != None: 1381 if li == 0: 1382 return r 1383 else: 1384 return ("int", 1) 1385 elif ri != None: 1386 if ri == 0: 1387 return l 1388 else: 1389 return ("int", 1) 1390 return e 1391 1392 def optimize(self,macros={}): 1393 self.expr = self.optimize_node(self.expr,macros) 1394 1395 def removePrefixedNode(self,e,prefix,names): 1396 op = e[0] 1397 if op == "defined": 1398 name = e[1] 1399 if name.startswith(prefix): 1400 if names.has_key[name] and names[name] == "y": 1401 return ("int", 1) 1402 else: 1403 return ("int", 0) 1404 1405 elif op in CppExpr.unaries: 1406 op, v = e 1407 v = self.removePrefixedNode(v,prefix,names) 1408 return (op, v) 1409 elif op in CppExpr.binaries: 1410 op, v1, v2 = e 1411 v1 = self.removePrefixedNode(v1,prefix,names) 1412 v2 = self.removePrefixedNode(v2,prefix,names) 1413 return (op, v1, v2) 1414 elif op == "call": 1415 func, params = e[1] 1416 params2 = [] 1417 for param in params: 1418 params2.append( self.removePrefixedNode(param,prefix,names) ) 1419 return (op, (func, params2)) 1420 1421 return e 1422 1423 def removePrefixed(self,prefix,names={}): 1424 self.expr = self.removePrefixedNode(self.expr,prefix,names) 1425 1426 def is_equal_node(self,e1,e2): 1427 if e1[0] != e2[0] or len(e1) != len(e2): 1428 return False 1429 1430 op = e1[0] 1431 if op == "int" or op == "hex" or op == "!" or op == "defined": 1432 return e1[0] == e2[0] 1433 1434 return self.is_equal_node(e1[1],e2[1]) and self.is_equal_node(e1[2],e2[2]) 1435 1436 def is_equal(self,other): 1437 return self.is_equal_node(self.expr,other.expr) 1438 1439def test_cpp_expr(expr, expected): 1440 e = CppExpr( CppLineTokenizer( expr ).toTokenList() ) 1441 #print repr(e.expr) 1442 s1 = repr(e) 1443 if s1 != expected: 1444 print "KO: expression '%s' generates '%s', should be '%s'" % (expr, s1, expected) 1445 else: 1446 #print "OK: expression '%s'" % expr 1447 pass 1448 1449def test_cpp_expr_optim(expr, expected, macros={}): 1450 e = CppExpr( CppLineTokenizer( expr ).toTokenList() ) 1451 e.optimize(macros) 1452 1453 s1 = repr(e) 1454 if s1 != expected: 1455 print "KO: optimized expression '%s' generates '%s', should be '%s'" % (expr, s1, expected) 1456 else: 1457 #print "OK: optmized expression '%s'" % expr 1458 pass 1459 1460def test_cpp_expr_source(expr, expected): 1461 e = CppExpr( CppLineTokenizer( expr ).toTokenList() ) 1462 s1 = str(e) 1463 if s1 != expected: 1464 print "KO: source expression '%s' generates '%s', should be '%s'" % (expr, s1, expected) 1465 else: 1466 #print "OK: source expression '%s'" % expr 1467 pass 1468 1469def test_CppExpr(): 1470 print "testing CppExpr" 1471 test_cpp_expr( "0", "(int 0)" ) 1472 test_cpp_expr( "1", "(int 1)" ) 1473 test_cpp_expr( "1 && 1", "(&& (int 1) (int 1))" ) 1474 test_cpp_expr( "1 && 0", "(&& (int 1) (int 0))" ) 1475 test_cpp_expr( "EXAMPLE", "(ident EXAMPLE)" ) 1476 test_cpp_expr( "EXAMPLE - 3", "(- (ident EXAMPLE) (int 3))" ) 1477 test_cpp_expr( "defined(EXAMPLE)", "(defined EXAMPLE)" ) 1478 test_cpp_expr( "!defined(EXAMPLE)", "(! (defined EXAMPLE))" ) 1479 test_cpp_expr( "defined(ABC) || defined(BINGO)", "(|| (defined ABC) (defined BINGO))" ) 1480 test_cpp_expr( "FOO(BAR)", "(call FOO [BAR])" ) 1481 1482 test_cpp_expr_optim( "0", "(int 0)" ) 1483 test_cpp_expr_optim( "1", "(int 1)" ) 1484 test_cpp_expr_optim( "1 && 1", "(int 1)" ) 1485 test_cpp_expr_optim( "1 && 0", "(int 0)" ) 1486 test_cpp_expr_optim( "0 && 1", "(int 0)" ) 1487 test_cpp_expr_optim( "0 && 0", "(int 0)" ) 1488 test_cpp_expr_optim( "1 || 1", "(int 1)" ) 1489 test_cpp_expr_optim( "1 || 0", "(int 1)" ) 1490 test_cpp_expr_optim( "0 || 1", "(int 1)" ) 1491 test_cpp_expr_optim( "0 || 0", "(int 0)" ) 1492 test_cpp_expr_optim( "EXAMPLE", "(ident EXAMPLE)" ) 1493 test_cpp_expr_optim( "EXAMPLE - 3", "(- (ident EXAMPLE) (int 3))" ) 1494 test_cpp_expr_optim( "defined(EXAMPLE)", "(defined EXAMPLE)" ) 1495 test_cpp_expr_optim( "defined(EXAMPLE)", "(int 1)", { "EXAMPLE": "XOWOE" } ) 1496 test_cpp_expr_optim( "defined(EXAMPLE)", "(int 0)", { "EXAMPLE": kCppUndefinedMacro} ) 1497 test_cpp_expr_optim( "!defined(EXAMPLE)", "(! (defined EXAMPLE))" ) 1498 test_cpp_expr_optim( "!defined(EXAMPLE)", "(int 0)", { "EXAMPLE" : "XOWOE" } ) 1499 test_cpp_expr_optim( "!defined(EXAMPLE)", "(int 1)", { "EXAMPLE" : kCppUndefinedMacro } ) 1500 test_cpp_expr_optim( "defined(ABC) || defined(BINGO)", "(|| (defined ABC) (defined BINGO))" ) 1501 test_cpp_expr_optim( "defined(ABC) || defined(BINGO)", "(int 1)", { "ABC" : "1" } ) 1502 test_cpp_expr_optim( "defined(ABC) || defined(BINGO)", "(int 1)", { "BINGO" : "1" } ) 1503 test_cpp_expr_optim( "defined(ABC) || defined(BINGO)", "(defined ABC)", { "BINGO" : kCppUndefinedMacro } ) 1504 test_cpp_expr_optim( "defined(ABC) || defined(BINGO)", "(int 0)", { "ABC" : kCppUndefinedMacro, "BINGO" : kCppUndefinedMacro } ) 1505 1506 test_cpp_expr_source( "0", "0" ) 1507 test_cpp_expr_source( "1", "1" ) 1508 test_cpp_expr_source( "1 && 1", "1 && 1" ) 1509 test_cpp_expr_source( "1 && 0", "1 && 0" ) 1510 test_cpp_expr_source( "0 && 1", "0 && 1" ) 1511 test_cpp_expr_source( "0 && 0", "0 && 0" ) 1512 test_cpp_expr_source( "1 || 1", "1 || 1" ) 1513 test_cpp_expr_source( "1 || 0", "1 || 0" ) 1514 test_cpp_expr_source( "0 || 1", "0 || 1" ) 1515 test_cpp_expr_source( "0 || 0", "0 || 0" ) 1516 test_cpp_expr_source( "EXAMPLE", "EXAMPLE" ) 1517 test_cpp_expr_source( "EXAMPLE - 3", "EXAMPLE - 3" ) 1518 test_cpp_expr_source( "defined(EXAMPLE)", "defined(EXAMPLE)" ) 1519 test_cpp_expr_source( "defined EXAMPLE", "defined(EXAMPLE)" ) 1520 1521 1522##################################################################################### 1523##################################################################################### 1524##### ##### 1525##### C P P B L O C K ##### 1526##### ##### 1527##################################################################################### 1528##################################################################################### 1529 1530class Block: 1531 """a class used to model a block of input source text. there are two block types: 1532 - directive blocks: contain the tokens of a single pre-processor directive (e.g. #if) 1533 - text blocks, contain the tokens of non-directive blocks 1534 1535 the cpp parser class below will transform an input source file into a list of Block 1536 objects (grouped in a BlockList object for convenience)""" 1537 1538 def __init__(self,tokens,directive=None,lineno=0): 1539 """initialize a new block, if 'directive' is None, this is a text block 1540 NOTE: this automatically converts '#ifdef MACRO' into '#if defined(MACRO)' 1541 and '#ifndef MACRO' into '#if !defined(MACRO)'""" 1542 if directive == "ifdef": 1543 tok = Token() 1544 tok.set(tokDEFINED) 1545 tokens = [ tok ] + tokens 1546 directive = "if" 1547 1548 elif directive == "ifndef": 1549 tok1 = Token() 1550 tok2 = Token() 1551 tok1.set(tokNOT) 1552 tok2.set(tokDEFINED) 1553 tokens = [ tok1, tok2 ] + tokens 1554 directive = "if" 1555 1556 self.tokens = tokens 1557 self.directive = directive 1558 if lineno > 0: 1559 self.lineno = lineno 1560 else: 1561 self.lineno = self.tokens[0].lineno 1562 1563 if self.isIf(): 1564 self.expr = CppExpr( self.tokens ) 1565 1566 def isDirective(self): 1567 """returns True iff this is a directive block""" 1568 return self.directive != None 1569 1570 def isConditional(self): 1571 """returns True iff this is a conditional directive block""" 1572 return self.directive in ["if","ifdef","ifndef","else","elif","endif"] 1573 1574 def isDefine(self): 1575 """returns the macro name in a #define directive, or None otherwise""" 1576 if self.directive != "define": 1577 return None 1578 1579 return self.tokens[0].value 1580 1581 def isIf(self): 1582 """returns True iff this is an #if-like directive block""" 1583 return self.directive in ["if","ifdef","ifndef","elif"] 1584 1585 def isInclude(self): 1586 """checks wether this is a #include directive. if true, then returns the 1587 corresponding file name (with brackets or double-qoutes). None otherwise""" 1588 if self.directive != "include": 1589 return None 1590 1591 #print "iii " + repr(self.tokens) 1592 if self.tokens[0].id == tokSTRING: 1593 # a double-quote include, that's easy 1594 return self.tokens[0].value 1595 1596 # we only want the bracket part, not any comments or junk after it 1597 if self.tokens[0].id == "<": 1598 i = 0 1599 tok = self.tokens 1600 n = len(tok) 1601 while i < n and tok[i].id != ">": 1602 i += 1 1603 1604 if i >= n: 1605 return None 1606 1607 return string.join([ str(x) for x in tok[:i+1] ],"") 1608 1609 else: 1610 return None 1611 1612 def removeWhiteSpace(self): 1613 # Remove trailing whitespace and empty lines 1614 # All whitespace is also contracted to a single space 1615 if self.directive != None: 1616 return 1617 1618 tokens = [] 1619 line = 0 # index of line start 1620 space = -1 # index of first space, or -1 1621 ii = 0 1622 nn = len(self.tokens) 1623 while ii < nn: 1624 tok = self.tokens[ii] 1625 1626 # If we find a space, record its position if this is the first 1627 # one the line start or the previous character. Don't append 1628 # anything to tokens array yet though. 1629 if tok.id == tokSPACE: 1630 if space < 0: 1631 space = ii 1632 ii += 1 1633 continue 1634 1635 # If this is a line space, ignore the spaces we found previously 1636 # on the line, and remove empty lines. 1637 if tok.id == tokLN: 1638 old_line = line 1639 old_space = space 1640 #print "N line=%d space=%d ii=%d" % (line, space, ii) 1641 ii += 1 1642 line = ii 1643 space = -1 1644 if old_space == old_line: # line only contains spaces 1645 #print "-s" 1646 continue 1647 if ii-1 == old_line: # line is empty 1648 #print "-e" 1649 continue 1650 tokens.append(tok) 1651 continue 1652 1653 # Other token, append any space range if any, converting each 1654 # one to a single space character, then append the token. 1655 if space >= 0: 1656 jj = space 1657 space = -1 1658 while jj < ii: 1659 tok2 = self.tokens[jj] 1660 tok2.value = " " 1661 tokens.append(tok2) 1662 jj += 1 1663 1664 tokens.append(tok) 1665 ii += 1 1666 1667 self.tokens = tokens 1668 1669 def writeWithWarning(self,out,warning,left_count,repeat_count): 1670 # removeWhiteSpace() will sometimes creates non-directive blocks 1671 # without any tokens. These come from blocks that only contained 1672 # empty lines and spaces. They should not be printed in the final 1673 # output, and then should not be counted for this operation. 1674 # 1675 if not self.directive and self.tokens == []: 1676 return left_count 1677 1678 if self.directive: 1679 out.write(str(self) + "\n") 1680 left_count -= 1 1681 if left_count == 0: 1682 out.write(warning) 1683 left_count = repeat_count 1684 1685 else: 1686 for tok in self.tokens: 1687 out.write(str(tok)) 1688 if tok.id == tokLN: 1689 left_count -= 1 1690 if left_count == 0: 1691 out.write(warning) 1692 left_count = repeat_count 1693 1694 return left_count 1695 1696 1697 def __repr__(self): 1698 """generate the representation of a given block""" 1699 if self.directive: 1700 result = "#%s " % self.directive 1701 if self.isIf(): 1702 result += repr(self.expr) 1703 else: 1704 for tok in self.tokens: 1705 result += repr(tok) 1706 else: 1707 result = "" 1708 for tok in self.tokens: 1709 result += repr(tok) 1710 1711 return result 1712 1713 def __str__(self): 1714 """generate the string representation of a given block""" 1715 if self.directive: 1716 if self.directive == "if": 1717 # small optimization to re-generate #ifdef and #ifndef 1718 e = self.expr.expr 1719 op = e[0] 1720 if op == "defined": 1721 result = "#ifdef %s" % e[1] 1722 elif op == "!" and e[1][0] == "defined": 1723 result = "#ifndef %s" % e[1][1] 1724 else: 1725 result = "#if " + str(self.expr) 1726 else: 1727 result = "#%s" % self.directive 1728 if len(self.tokens): 1729 result += " " 1730 for tok in self.tokens: 1731 result += str(tok) 1732 else: 1733 result = "" 1734 for tok in self.tokens: 1735 result += str(tok) 1736 1737 return result 1738 1739class BlockList: 1740 """a convenience class used to hold and process a list of blocks returned by 1741 the cpp parser""" 1742 def __init__(self,blocks): 1743 self.blocks = blocks 1744 1745 def __len__(self): 1746 return len(self.blocks) 1747 1748 def __getitem__(self,n): 1749 return self.blocks[n] 1750 1751 def __repr__(self): 1752 return repr(self.blocks) 1753 1754 def __str__(self): 1755 result = "" 1756 for b in self.blocks: 1757 result += str(b) 1758 if b.isDirective(): 1759 result += '\n' 1760 return result 1761 1762 def optimizeIf01(self): 1763 """remove the code between #if 0 .. #endif in a BlockList""" 1764 self.blocks = optimize_if01(self.blocks) 1765 1766 def optimizeMacros(self, macros): 1767 """remove known defined and undefined macros from a BlockList""" 1768 for b in self.blocks: 1769 if b.isIf(): 1770 b.expr.optimize(macros) 1771 1772 def removeMacroDefines(self,macros): 1773 """remove known macro definitions from a BlockList""" 1774 self.blocks = remove_macro_defines(self.blocks,macros) 1775 1776 def removePrefixed(self,prefix,names): 1777 for b in self.blocks: 1778 if b.isIf(): 1779 b.expr.removePrefixed(prefix,names) 1780 1781 def removeWhiteSpace(self): 1782 for b in self.blocks: 1783 b.removeWhiteSpace() 1784 1785 def optimizeAll(self,macros): 1786 self.optimizeMacros(macros) 1787 self.optimizeIf01() 1788 return 1789 1790 def findIncludes(self): 1791 """return the list of included files in a BlockList""" 1792 result = [] 1793 for b in self.blocks: 1794 i = b.isInclude() 1795 if i: 1796 result.append(i) 1797 1798 return result 1799 1800 1801 def write(self,out): 1802 out.write(str(self)) 1803 1804 def writeWithWarning(self,out,warning,repeat_count): 1805 left_count = repeat_count 1806 for b in self.blocks: 1807 left_count = b.writeWithWarning(out,warning,left_count,repeat_count) 1808 1809 def removeComments(self): 1810 for b in self.blocks: 1811 for tok in b.tokens: 1812 if tok.id == tokSPACE: 1813 tok.value = " " 1814 1815 def removeVarsAndFuncs(self,knownStatics=set()): 1816 """remove all extern and static declarations corresponding 1817 to variable and function declarations. we only accept typedefs 1818 and enum/structs/union declarations. 1819 1820 however, we keep the definitions corresponding to the set 1821 of known static inline functions in the set 'knownStatics', 1822 which is useful for optimized byteorder swap functions and 1823 stuff like that. 1824 """ 1825 # state = 0 => normal (i.e. LN + spaces) 1826 # state = 1 => typedef/struct encountered, ends with ";" 1827 # state = 2 => var declaration encountered, ends with ";" 1828 # state = 3 => func declaration encountered, ends with "}" 1829 state = 0 1830 depth = 0 1831 blocks2 = [] 1832 skipTokens = False 1833 for b in self.blocks: 1834 if b.isDirective(): 1835 blocks2.append(b) 1836 else: 1837 n = len(b.tokens) 1838 i = 0 1839 if skipTokens: 1840 first = n 1841 else: 1842 first = 0 1843 while i < n: 1844 tok = b.tokens[i] 1845 tokid = tok.id 1846 # If we are not looking for the start of a new 1847 # type/var/func, then skip over tokens until 1848 # we find our terminator, managing the depth of 1849 # accolades as we go. 1850 if state > 0: 1851 terminator = False 1852 if tokid == '{': 1853 depth += 1 1854 elif tokid == '}': 1855 if depth > 0: 1856 depth -= 1 1857 if (depth == 0) and (state == 3): 1858 terminator = True 1859 elif tokid == ';' and depth == 0: 1860 terminator = True 1861 1862 if terminator: 1863 # we found the terminator 1864 state = 0 1865 if skipTokens: 1866 skipTokens = False 1867 first = i+1 1868 1869 i = i+1 1870 continue 1871 1872 # We are looking for the start of a new type/func/var 1873 # ignore whitespace 1874 if tokid in [tokLN, tokSPACE]: 1875 i = i+1 1876 continue 1877 1878 # Is it a new type definition, then start recording it 1879 if tok.value in [ 'struct', 'typedef', 'enum', 'union', '__extension__' ]: 1880 #print "$$$ keep type declr" + repr(b.tokens[i:]) 1881 state = 1 1882 i = i+1 1883 continue 1884 1885 # Is it a variable or function definition. If so, first 1886 # try to determine which type it is, and also extract 1887 # its name. 1888 # 1889 # We're going to parse the next tokens of the same block 1890 # until we find a semi-column or a left parenthesis. 1891 # 1892 # The semi-column corresponds to a variable definition, 1893 # the left-parenthesis to a function definition. 1894 # 1895 # We also assume that the var/func name is the last 1896 # identifier before the terminator. 1897 # 1898 j = i+1 1899 ident = "" 1900 while j < n: 1901 tokid = b.tokens[j].id 1902 if tokid == '(': # a function declaration 1903 state = 3 1904 break 1905 elif tokid == ';': # a variable declaration 1906 state = 2 1907 break 1908 if tokid == tokIDENT: 1909 ident = b.tokens[j].value 1910 j += 1 1911 1912 if j >= n: 1913 # This can only happen when the declaration 1914 # does not end on the current block (e.g. with 1915 # a directive mixed inside it. 1916 # 1917 # We will treat it as malformed because 1918 # it's very hard to recover from this case 1919 # without making our parser much more 1920 # complex. 1921 # 1922 #print "### skip unterminated static '%s'" % ident 1923 break 1924 1925 if ident in knownStatics: 1926 #print "### keep var/func '%s': %s" % (ident,repr(b.tokens[i:j])) 1927 pass 1928 else: 1929 # We're going to skip the tokens for this declaration 1930 #print "### skip variable /func'%s': %s" % (ident,repr(b.tokens[i:j])) 1931 if i > first: 1932 blocks2.append( Block(b.tokens[first:i])) 1933 skipTokens = True 1934 first = n 1935 1936 i = i+1 1937 1938 if i > first: 1939 #print "### final '%s'" % repr(b.tokens[first:i]) 1940 blocks2.append( Block(b.tokens[first:i]) ) 1941 1942 self.blocks = blocks2 1943 1944 def insertDisclaimer(self,disclaimer="/* auto-generated file, DO NOT EDIT */"): 1945 """insert your standard issue disclaimer that this is an 1946 auto-generated file, etc..""" 1947 tokens = CppLineTokenizer( disclaimer ).toTokenList() 1948 tokens = tokens[:-1] # remove trailing tokLN 1949 self.blocks = [ Block(tokens) ] + self.blocks 1950 1951 def replaceTokens(self,replacements=dict()): 1952 """replace tokens according to the given dict 1953 """ 1954 for b in self.blocks: 1955 if not b.isDirective(): 1956 for tok in b.tokens: 1957 if tok.id == tokIDENT: 1958 if tok.value in replacements: 1959 tok.value = replacements[tok.value] 1960 1961class BlockParser: 1962 """a class used to convert an input source file into a BlockList object""" 1963 1964 def __init__(self,tokzer=None): 1965 """initialize a block parser. the input source is provided through a Tokenizer 1966 object""" 1967 self.reset(tokzer) 1968 1969 def reset(self,tokzer): 1970 self.state = 1 1971 self.tokzer = tokzer 1972 1973 def getBlocks(self,tokzer=None): 1974 """tokenize and parse the input source, return a BlockList object 1975 NOTE: empty and line-numbering directives are ignored and removed 1976 from the result. as a consequence, it is possible to have 1977 two successive text blocks in the result""" 1978 # state 0 => in source code 1979 # state 1 => in source code, after a LN 1980 # state 2 => in source code, after LN then some space 1981 state = 1 1982 lastLN = 0 1983 current = [] 1984 blocks = [] 1985 1986 if tokzer == None: 1987 tokzer = self.tokzer 1988 1989 while 1: 1990 tok = tokzer.getToken() 1991 if tok.id == tokEOF: 1992 break 1993 1994 if tok.id == tokLN: 1995 state = 1 1996 current.append(tok) 1997 lastLN = len(current) 1998 1999 elif tok.id == tokSPACE: 2000 if state == 1: 2001 state = 2 2002 current.append(tok) 2003 2004 elif tok.id == "#": 2005 if state > 0: 2006 # this is the start of a directive 2007 2008 if lastLN > 0: 2009 # record previous tokens as text block 2010 block = Block(current[:lastLN]) 2011 blocks.append(block) 2012 lastLN = 0 2013 2014 current = [] 2015 2016 # skip spaces after the # 2017 while 1: 2018 tok = tokzer.getToken() 2019 if tok.id != tokSPACE: 2020 break 2021 2022 if tok.id != tokIDENT: 2023 # empty or line-numbering, ignore it 2024 if tok.id != tokLN and tok.id != tokEOF: 2025 while 1: 2026 tok = tokzer.getToken() 2027 if tok.id == tokLN or tok.id == tokEOF: 2028 break 2029 continue 2030 2031 directive = tok.value 2032 lineno = tok.lineno 2033 2034 # skip spaces 2035 tok = tokzer.getToken() 2036 while tok.id == tokSPACE: 2037 tok = tokzer.getToken() 2038 2039 # then record tokens until LN 2040 dirtokens = [] 2041 while tok.id != tokLN and tok.id != tokEOF: 2042 dirtokens.append(tok) 2043 tok = tokzer.getToken() 2044 2045 block = Block(dirtokens,directive,lineno) 2046 blocks.append(block) 2047 state = 1 2048 2049 else: 2050 state = 0 2051 current.append(tok) 2052 2053 if len(current) > 0: 2054 block = Block(current) 2055 blocks.append(block) 2056 2057 return BlockList(blocks) 2058 2059 def parse(self,tokzer): 2060 return self.getBlocks( tokzer ) 2061 2062 def parseLines(self,lines): 2063 """parse a list of text lines into a BlockList object""" 2064 return self.getBlocks( CppLinesTokenizer(lines) ) 2065 2066 def parseFile(self,path): 2067 """parse a file into a BlockList object""" 2068 file = open(path, "rt") 2069 result = self.getBlocks( CppFileTokenizer(file) ) 2070 file.close() 2071 return result 2072 2073 2074def test_block_parsing(lines,expected): 2075 blocks = BlockParser().parse( CppLinesTokenizer(lines) ) 2076 if len(blocks) != len(expected): 2077 raise BadExpectedToken, "parser.buildBlocks returned '%s' expecting '%s'" \ 2078 % (str(blocks), repr(expected)) 2079 for n in range(len(blocks)): 2080 if str(blocks[n]) != expected[n]: 2081 raise BadExpectedToken, "parser.buildBlocks()[%d] is '%s', expecting '%s'" \ 2082 % (n, str(blocks[n]), expected[n]) 2083 #for block in blocks: 2084 # print block 2085 2086def test_BlockParser(): 2087 test_block_parsing(["#error hello"],["#error hello"]) 2088 test_block_parsing([ "foo", "", "bar" ], [ "foo\n\nbar\n" ]) 2089 test_block_parsing([ "foo", " # ", "bar" ], [ "foo\n","bar\n" ]) 2090 test_block_parsing(\ 2091 [ "foo", " # ", " # /* ahah */ if defined(__KERNEL__) ", "bar", "#endif" ], 2092 [ "foo\n", "#ifdef __KERNEL__", "bar\n", "#endif" ] ) 2093 2094 2095##################################################################################### 2096##################################################################################### 2097##### ##### 2098##### B L O C K L I S T O P T I M I Z A T I O N ##### 2099##### ##### 2100##################################################################################### 2101##################################################################################### 2102 2103def remove_macro_defines( blocks, excludedMacros=set() ): 2104 """remove macro definitions like #define <macroName> ....""" 2105 result = [] 2106 for b in blocks: 2107 macroName = b.isDefine() 2108 if macroName == None or not macroName in excludedMacros: 2109 result.append(b) 2110 2111 return result 2112 2113def find_matching_endif( blocks, i ): 2114 n = len(blocks) 2115 depth = 1 2116 while i < n: 2117 if blocks[i].isDirective(): 2118 dir = blocks[i].directive 2119 if dir in [ "if", "ifndef", "ifdef" ]: 2120 depth += 1 2121 elif depth == 1 and dir in [ "else", "elif" ]: 2122 return i 2123 elif dir == "endif": 2124 depth -= 1 2125 if depth == 0: 2126 return i 2127 i += 1 2128 return i 2129 2130def optimize_if01( blocks ): 2131 """remove the code between #if 0 .. #endif in a list of CppBlocks""" 2132 i = 0 2133 n = len(blocks) 2134 result = [] 2135 while i < n: 2136 j = i 2137 while j < n and not blocks[j].isIf(): 2138 j += 1 2139 if j > i: 2140 D2("appending lines %d to %d" % (blocks[i].lineno, blocks[j-1].lineno)) 2141 result += blocks[i:j] 2142 if j >= n: 2143 break 2144 expr = blocks[j].expr 2145 r = expr.toInt() 2146 if r == None: 2147 result.append(blocks[j]) 2148 i = j + 1 2149 continue 2150 2151 if r == 0: 2152 # if 0 => skip everything until the corresponding #endif 2153 j = find_matching_endif( blocks, j+1 ) 2154 if j >= n: 2155 # unterminated #if 0, finish here 2156 break 2157 dir = blocks[j].directive 2158 if dir == "endif": 2159 D2("remove 'if 0' .. 'endif' (lines %d to %d)" % (blocks[i].lineno, blocks[j].lineno)) 2160 i = j + 1 2161 elif dir == "else": 2162 # convert 'else' into 'if 1' 2163 D2("convert 'if 0' .. 'else' into 'if 1' (lines %d to %d)" % (blocks[i].lineno, blocks[j-1].lineno)) 2164 blocks[j].directive = "if" 2165 blocks[j].expr = CppExpr( CppLineTokenizer("1").toTokenList() ) 2166 i = j 2167 elif dir == "elif": 2168 # convert 'elif' into 'if' 2169 D2("convert 'if 0' .. 'elif' into 'if'") 2170 blocks[j].directive = "if" 2171 i = j 2172 continue 2173 2174 # if 1 => find corresponding endif and remove/transform them 2175 k = find_matching_endif( blocks, j+1 ) 2176 if k >= n: 2177 # unterminated #if 1, finish here 2178 D2("unterminated 'if 1'") 2179 result += blocks[j+1:k] 2180 break 2181 2182 dir = blocks[k].directive 2183 if dir == "endif": 2184 D2("convert 'if 1' .. 'endif' (lines %d to %d)" % (blocks[j].lineno, blocks[k].lineno)) 2185 result += optimize_if01(blocks[j+1:k]) 2186 i = k+1 2187 elif dir == "else": 2188 # convert 'else' into 'if 0' 2189 D2("convert 'if 1' .. 'else' (lines %d to %d)" % (blocks[j].lineno, blocks[k].lineno)) 2190 result += optimize_if01(blocks[j+1:k]) 2191 blocks[k].directive = "if" 2192 blocks[k].expr = CppExpr( CppLineTokenizer("0").toTokenList() ) 2193 i = k 2194 elif dir == "elif": 2195 # convert 'elif' into 'if 0' 2196 D2("convert 'if 1' .. 'elif' (lines %d to %d)" % (blocks[j].lineno, blocks[k].lineno)) 2197 result += optimize_if01(blocks[j+1:k]) 2198 blocks[k].expr = CppExpr( CppLineTokenizer("0").toTokenList() ) 2199 i = k 2200 return result 2201 2202def test_optimizeAll(): 2203 text = """\ 2204#if 1 2205#define GOOD_1 2206#endif 2207#if 0 2208#define BAD_2 2209#define BAD_3 2210#endif 2211 2212#if 1 2213#define GOOD_2 2214#else 2215#define BAD_4 2216#endif 2217 2218#if 0 2219#define BAD_5 2220#else 2221#define GOOD_3 2222#endif 2223 2224#if 0 2225#if 1 2226#define BAD_6 2227#endif 2228#endif\ 2229""" 2230 2231 expected = """\ 2232#define GOOD_1 2233 2234#define GOOD_2 2235 2236#define GOOD_3 2237 2238""" 2239 2240 print "running test_BlockList.optimizeAll" 2241 out = StringOutput() 2242 lines = string.split(text, '\n') 2243 list = BlockParser().parse( CppLinesTokenizer(lines) ) 2244 #D_setlevel(2) 2245 list.optimizeAll( {"__KERNEL__":kCppUndefinedMacro} ) 2246 #print repr(list) 2247 list.write(out) 2248 if out.get() != expected: 2249 print "KO: macro optimization failed\n" 2250 print "<<<< expecting '", 2251 print expected, 2252 print "'\n>>>> result '" 2253 print out.get(), 2254 print "'\n----" 2255 2256 2257##################################################################################### 2258##################################################################################### 2259##### ##### 2260##### ##### 2261##### ##### 2262##################################################################################### 2263##################################################################################### 2264 2265def runUnitTests(): 2266 """run all unit tests for this program""" 2267 print "running unit tests" 2268 test_CppTokenizer() 2269 test_CppExpr() 2270 test_optimizeAll() 2271 test_BlockParser() 2272 print "OK" 2273 2274if __name__ == "__main__": 2275 runUnitTests() 2276