# Regular expression patterns for C syntax.
#
# None of these patterns has any capturing.  However, a number of them
# have capturing markers compatible with utils.set_capture_groups().
#
# All of the patterns are written in "verbose" style (insignificant
# whitespace plus "#" comments), so they must be compiled with
# re.VERBOSE.  Literal braces are doubled ("{{" / "}}") wherever the
# template is an f-string.

import textwrap


def _ind(text, level=1, edges='both'):
    """Return *text* re-indented for embedding in a pattern template.

    Every non-blank line of *text* is indented by *level* indent units.
    *edges* selects how the ends of the result are normalized:

    * 'pre'  - leading whitespace is replaced by a newline followed by
               the indent, so the text always starts on a fresh line.
    * 'post' - trailing whitespace is replaced by a newline followed by
               the indent of the enclosing level (level - 1).
    * 'both' - both of the above (the default).

    Any other value leaves both ends as textwrap.indent() produced them.
    The indentation is cosmetic only; it keeps the composed verbose
    patterns readable when they are dumped for debugging.
    """
    indent = '    ' * level
    text = textwrap.indent(text, indent)
    if edges in ('pre', 'both'):
        text = '\n' + indent + text.lstrip()
    if edges in ('post', 'both'):
        text = text.rstrip() + '\n' + '    ' * (level - 1)
    return text


#######################################
# general

# A single hexadecimal digit (used for '\xNN' character escapes).
HEX = r'(?: [0-9a-fA-F] )'

# A character literal or a (double-quoted) string literal.
STRING_LITERAL = textwrap.dedent(rf'''
    (?:
        # character literal
        (?:
            ['] [^'] [']
            |
            ['] \\ . [']
            |
            ['] \\x{HEX}{HEX} [']
            |
            ['] \\0\d\d [']
            |
            (?:
                ['] \\o[01]\d\d [']
                |
                ['] \\o2[0-4]\d [']
                |
                ['] \\o25[0-5] [']
            )
        )
        |
        # string literal
        (?:
            ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
        )
        # end string literal
    )
    ''')

_KEYWORD = textwrap.dedent(r'''
    (?:
        \b
        (?:
            auto |
            extern |
            register |
            static |
            typedef |

            const |
            volatile |

            signed |
            unsigned |
            char |
            short |
            int |
            long |
            float |
            double |
            void |

            struct |
            union |
            enum |

            goto |
            return |
            sizeof |
            break |
            continue |
            if |
            else |
            for |
            do |
            while |
            switch |
            case |
            default |
            entry
        )
        \b
    )
    ''')
KEYWORD = rf'''
    # keyword
    {_KEYWORD}
    # end keyword
    '''
# From here on _KEYWORD is the compact (whitespace-free) form, which is
# what gets embedded in the single-line identifier patterns below.
_KEYWORD = ''.join(_KEYWORD.split())

IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
# We use a negative lookahead to filter out keywords.
STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
# Like STRICT_IDENTIFIER, but allows an optional "-<digits>" suffix.
ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'


#######################################
# types

SIMPLE_TYPE = textwrap.dedent(rf'''
    # simple type
    (?:
        \b
        (?:
            void
            |
            (?: signed | unsigned )  # implies int
            |
            (?:
                (?: (?: signed | unsigned ) \s+ )?
                (?: (?: long | short ) \s+ )?
                (?: char | short | int | long | float | double )
            )
        )
        \b
    )
    # end simple type
    ''')

COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'


#######################################
# variable declarations

_STORAGE = 'auto register static extern'.split()
STORAGE_CLASS = rf'(?: \b (?: {" | ".join(_STORAGE)} ) \b )'
TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
# A single "*", optionally followed by a type qualifier.
PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )'

TYPE_SPEC = textwrap.dedent(rf'''
    # type spec
    (?:
        {_ind(SIMPLE_TYPE, 2)}
        |
        (?:
            [_]*typeof[_]*
            \s* [(]
            (?: \s* [*&] )*
            \s* {STRICT_IDENTIFIER}
            \s* [)]
        )
        |
        # reference to a compound type
        (?:
            {COMPOUND_TYPE_KIND}
            (?: \s* {ANON_IDENTIFIER} )?
        )
        |
        # reference to a typedef
        {STRICT_IDENTIFIER}
    )
    # end type spec
    ''')

DECLARATOR = textwrap.dedent(rf'''
    # declarator  (possibly abstract)
    (?:
        (?: {PTR_QUALIFIER} \s* )*
        (?:
            (?:
                (?:  # <IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
            )
            |
            (?:
                [(] \s*
                (?:  # <WRAPPED_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
                \s* [)]
            )
            |
            # func ptr
            (?:
                [(] (?: \s* {PTR_QUALIFIER} )? \s*
                (?:  # <FUNC_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
                \s* [)]
                # We allow for a single level of paren nesting in parameters.
                \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
            )
        )
    )
    # end declarator
    ''')

VAR_DECL = textwrap.dedent(rf'''
    # var decl (and typedef and func return type)
    (?:
        (?:
            (?:  # <STORAGE>
                {STORAGE_CLASS}
            )
            \s*
        )?
        (?:
            (?:  # <TYPE_QUAL>
                {TYPE_QUALIFIER}
            )
            \s*
        )?
        (?:
            (?:  # <TYPE_SPEC>
                {_ind(TYPE_SPEC, 4)}
            )
        )
        \s*
        (?:
            (?:  # <DECLARATOR>
                {_ind(DECLARATOR, 4)}
            )
        )
    )
    # end var decl
    ''')

INITIALIZER = textwrap.dedent(rf'''
    # initializer
    (?:
        (?:
            [(]
            # no nested parens (e.g. func ptr)
            [^)]*
            [)]
            \s*
        )?
        (?:
            # a string literal
            (?:
                (?: {_ind(STRING_LITERAL, 4)} \s* )*
                {_ind(STRING_LITERAL, 4)}
            )
            |

            # a simple initializer
            (?:
                (?:
                    [^'",;{{]*
                    {_ind(STRING_LITERAL, 4)}
                )*
                [^'",;{{]*
            )
            |

            # a struct/array literal
            (?:
                # We only expect compound initializers with
                # single-variable declarations.
                {{
                (?:
                    [^'";]*?
                    {_ind(STRING_LITERAL, 5)}
                )*
                [^'";]*?
                }}
                (?= \s* ; )  # Note this lookahead.
            )
        )
    )
    # end initializer
    ''')


#######################################
# compound type declarations

STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        # inline compound type decl
        (?:
            (?:  # <COMPOUND_TYPE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?:  # <COMPOUND_TYPE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        (?:
            # typed member
            (?:
                # Technically it doesn't have to have a type...
                (?:  # <SPECIFIER_QUALIFIER>
                    (?: {TYPE_QUALIFIER} \s* )?
                    {_ind(TYPE_SPEC, 5)}
                )
                (?:
                    # If it doesn't have a declarator then it will have
                    # a size and vice versa.
                    \s*
                    (?:  # <DECLARATOR>
                        {_ind(DECLARATOR, 6)}
                    )
                )?
            )

            # sized member
            (?:
                \s* [:] \s*
                (?:  # <SIZE>
                    # This is actually a "constant expression".
                    \d+
                    |
                    [^'",}}]+
                )
            )?
            \s*
            (?:  # <ENDING>
                [,;]
            )
        )
        |
        (?:
            \s*
            (?:  # <CLOSE>
                }}
            )
        )
    )
    ''')

ENUM_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        (?:
            \s*
            (?:  # <CLOSE>
                }}
            )
        )
        |
        (?:
            \s*
            (?:  # <NAME>
                {IDENTIFIER}
            )
            (?:
                \s* = \s*
                (?:  # <INIT>
                    # This is actually a "constant expression".
                    {_ind(STRING_LITERAL, 4)}
                    |
                    [^'",}}]+
                )
            )?
            \s*
            (?:  # <ENDING>
                , | }}
            )
        )
    )
    ''')


#######################################
# statements

SIMPLE_STMT_BODY = textwrap.dedent(rf'''
    # simple statement body
    (?:
        (?:
            [^'"{{}};]*
            {_ind(STRING_LITERAL, 3)}
        )*
        [^'"{{}};]*
        #(?= [;{{] )  # Note this lookahead.
    )
    # end simple statement body
    ''')
SIMPLE_STMT = textwrap.dedent(rf'''
    # simple statement
    (?:
        (?:  # <SIMPLE_STMT>
            # stmt-inline "initializer"
            (?:
                return \b
                (?:
                    \s*
                    {_ind(INITIALIZER, 5)}
                )?
            )
            |
            # variable assignment
            (?:
                (?: [*] \s* )?
                (?:
                    {STRICT_IDENTIFIER} \s*
                    (?: [.] | -> ) \s*
                )*
                {STRICT_IDENTIFIER}
                (?: \s* \[ \s* \d+ \s* \] )?
                \s* = \s*
                {_ind(INITIALIZER, 4)}
            )
            |
            # catchall return statement
            (?:
                return \b
                (?:
                    (?:
                        [^'";]*
                        {_ind(STRING_LITERAL, 6)}
                    )*
                    \s* [^'";]*
                )?
            )
            |
            # simple statement
            (?:
                {_ind(SIMPLE_STMT_BODY, 4)}
            )
        )
        \s*
        (?:  # <SIMPLE_ENDING>
            ;
        )
    )
    # end simple statement
    ''')
COMPOUND_STMT = textwrap.dedent(rf'''
    # compound statement
    (?:
        \b
        (?:
            (?:
                (?:  # <COMPOUND_BARE>
                    else | do
                )
                \b
            )
            |
            (?:
                (?:  # <COMPOUND_LABELED>
                    (?:
                        case \b
                        (?:
                            [^'":]*
                            {_ind(STRING_LITERAL, 7)}
                        )*
                        \s* [^'":]*
                    )
                    |
                    default
                    |
                    {STRICT_IDENTIFIER}
                )
                \s* [:]
            )
            |
            (?:
                (?:  # <COMPOUND_PAREN>
                    for | while | if | switch
                )
                \s* (?= [(] )  # Note this lookahead.
            )
        )
        \s*
    )
    # end compound statement
    ''')


#######################################
# function bodies

LOCAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?:  # <EMPTY>
            ;
        )
        |
        # inline type decl
        (?:
            (?:
                (?:  # <INLINE_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?:  # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
            )?  # </INLINE_PRE>
            (?:  # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?:  # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # var decl
        (?:
            (?:  # <STORAGE>
                {STORAGE_CLASS}
            )?  # </STORAGE>
            (?:
                \s*
                (?:  # <VAR_DECL>
                    {_ind(VAR_DECL, 5)}
                )
            )
            (?:
                (?:
                    # initializer
                    # We expect only basic initializers.
                    \s* = \s*
                    (?:  # <VAR_INIT>
                        {_ind(INITIALIZER, 6)}
                    )
                )?
                (?:
                    \s*
                    (?:  # <VAR_ENDING>
                        [,;]
                    )
                )
            )
        )
        |
        {_ind(COMPOUND_STMT, 2)}
        |
        # start-of-block
        (?:
            (?:  # <BLOCK_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 5)}
                )*
                [^'"{{}};]*
                # Presumably we will not see "== {{".
                [^\s='"{{}});]
                \s*
            )?  # </BLOCK_LEADING>
            (?:  # <BLOCK_OPEN>
                {{
            )
        )
        |
        {_ind(SIMPLE_STMT, 2)}
        |
        # end-of-block
        (?:  # <BLOCK_CLOSE>
            }}
        )
    )
    ''')

LOCAL_STATICS = textwrap.dedent(rf'''
    (?:
        # inline type decl
        (?:
            (?:
                (?:  # <INLINE_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?:  # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
            )?
            (?:  # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?:  # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # var decl
        (?:
            # We only look for static variables.
            (?:  # <STATIC_DECL>
                static \b
                (?: \s* {TYPE_QUALIFIER} )?
                \s* {_ind(TYPE_SPEC, 4)}
                \s* {_ind(DECLARATOR, 4)}
            )
            \s*
            (?:
                (?:  # <STATIC_INIT>
                    = \s*
                    {_ind(INITIALIZER, 4)}
                    \s*
                    [,;{{]
                )
                |
                (?:  # <STATIC_ENDING>
                    [,;]
                )
            )
        )
        |
        # everything else
        (?:
            (?:  # <DELIM_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 4)}
                )*
                \s* [^'"{{}};]*
            )
            (?:
                (?:  # <BLOCK_OPEN>
                    {{
                )
                |
                (?:  # <BLOCK_CLOSE>
                    }}
                )
                |
                (?:  # <STMT_END>
                    ;
                )
            )
        )
    )
    ''')


#######################################
# global declarations

GLOBAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?:  # <EMPTY>
            ;
        )
        |

        # compound type decl (maybe inline)
        (?:
            (?:
                (?:  # <COMPOUND_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?:  # <COMPOUND_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?:  # <COMPOUND_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # bogus inline decl artifact
        # This simplifies resolving the relative syntactic ambiguity of
        # inline structs.
        (?:
            (?:  # <FORWARD_KIND>
                {COMPOUND_TYPE_KIND}
            )
            \s*
            (?:  # <FORWARD_NAME>
                {ANON_IDENTIFIER}
            )
            (?:  # <MAYBE_INLINE_ACTUAL>
                [^=,;({{[*\]]*
                [=,;({{]
            )
        )
        |

        # typedef
        (?:
            \b typedef \b \s*
            (?:  # <TYPEDEF_DECL>
                {_ind(VAR_DECL, 4)}
            )
            (?:
                # We expect no inline type definitions in the parameters.
                \s* [(] \s*
                (?:  # <TYPEDEF_FUNC_PARAMS>
                    [^{{;]*
                )
                \s* [)]
            )?
            \s* ;
        )
        |

        # func decl/definition & var decls
        # XXX dedicated pattern for funcs (more restricted)?
        (?:
            (?:
                (?:  # <VAR_STORAGE>
                    {STORAGE_CLASS}
                )
                \s*
            )?
            (?:
                (?:  # <FUNC_INLINE>
                    \b inline \b
                )
                \s*
            )?
            (?:  # <VAR_DECL>
                {_ind(VAR_DECL, 4)}
            )
            (?:
                # func decl / definition
                (?:
                    (?:
                        # We expect no inline type definitions in the parameters.
                        \s* [(] \s*
                        (?:  # <FUNC_PARAMS>
                            [^{{;]*
                        )
                        \s* [)] \s*
                        (?:  # <FUNC_DELIM>
                            [{{;]
                        )
                    )
                    |
                    (?:
                        # This is some old-school syntax!
                        \s* [(] \s*
                        # We throw away the bare names:
                        {STRICT_IDENTIFIER}
                        (?: \s* , \s* {STRICT_IDENTIFIER} )*
                        \s* [)] \s*

                        # We keep the trailing param declarations:
                        (?:  # <FUNC_LEGACY_PARAMS>
                            # There's at least one!
                            (?: {TYPE_QUALIFIER} \s* )?
                            {_ind(TYPE_SPEC, 7)}
                            \s*
                            {_ind(DECLARATOR, 7)}
                            \s* ;
                            (?:
                                \s*
                                (?: {TYPE_QUALIFIER} \s* )?
                                {_ind(TYPE_SPEC, 8)}
                                \s*
                                {_ind(DECLARATOR, 8)}
                                \s* ;
                            )*
                        )
                        \s* {{
                    )
                )
                |
                # var / typedef
                (?:
                    (?:
                        # initializer
                        # We expect only basic initializers.
                        \s* = \s*
                        (?:  # <VAR_INIT>
                            {_ind(INITIALIZER, 6)}
                        )
                    )?
                    \s*
                    (?:  # <VAR_ENDING>
                        [,;]
                    )
                )
            )
        )
    )
    ''')