# Regular expression patterns for C syntax.
#
# None of these patterns has any capturing.  However, a number of them
# have capturing markers compatible with utils.set_capture_groups().
#
# All of the patterns are written for verbose mode (re.VERBOSE):
# whitespace outside character classes is ignored and "#" starts a
# comment that runs to the end of the line.
#
# NOTE(review): the capturing markers are written as "(?:  # <NAME>";
# the exact spacing is presumably what utils.set_capture_groups() looks
# for -- confirm against that helper before reformatting any marker.

import textwrap


def _ind(text, level=1, edges='both'):
    """Return *text* indented by *level* levels, for embedding in a pattern.

    *edges* selects which ends of the result get normalized:

    * 'pre':  leading whitespace is replaced by a newline plus the indent
    * 'post': trailing whitespace is replaced by a newline plus the
      indent of the enclosing level (level - 1)
    * 'both' (the default): both of the above

    Any other value leaves both ends as textwrap.indent() produced them.
    """
    indent = ' ' * level
    text = textwrap.indent(text, indent)
    if edges == 'pre' or edges == 'both':
        text = '\n' + indent + text.lstrip()
    if edges == 'post' or edges == 'both':
        # The trailing newline also terminates any "# ..." comment that
        # ends the embedded pattern, so it cannot swallow what follows.
        text = text.rstrip() + '\n' + ' ' * (level - 1)
    return text


#######################################
# general

# A single hexadecimal digit (used for '\xHH' character escapes).
HEX = r'(?: [0-9a-fA-F] )'

# A character literal or a double-quoted string literal.
STRING_LITERAL = textwrap.dedent(rf'''
    (?:
        # character literal
        (?:
            ['] [^'] [']
            |
            ['] \\ . [']
            |
            ['] \\x{HEX}{HEX} [']
            |
            ['] \\0\d\d [']
            |
            (?:
                ['] \\o[01]\d\d [']
                |
                ['] \\o2[0-4]\d [']
                |
                ['] \\o25[0-5] [']
            )
        )
        |
        # string literal
        (?:
            ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
        )
        # end string literal
    )
    ''')

# Any one of the recognized C keywords, as a whole word.
_KEYWORD = textwrap.dedent(r'''
    (?:
        \b
        (?:
            auto |
            extern |
            register |
            static |
            typedef |

            const |
            volatile |

            signed |
            unsigned |
            char |
            short |
            int |
            long |
            float |
            double |
            void |

            struct |
            union |
            enum |

            goto |
            return |
            sizeof |
            break |
            continue |
            if |
            else |
            for |
            do |
            while |
            switch |
            case |
            default |
            entry
        )
        \b
    )
    ''')
# The public pattern keeps the readable multi-line form...
KEYWORD = rf'''
    # keyword
    {_KEYWORD}
    # end keyword
    '''
# ...while the private one is compacted (all whitespace removed) before
# it gets embedded in the single-line identifier patterns below.
_KEYWORD = ''.join(_KEYWORD.split())

IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
# We use a negative lookahead to filter out keywords.
STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
# Like STRICT_IDENTIFIER, but allows an optional "-<digits>" suffix.
ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'


#######################################
# types

# A builtin type, optionally with a sign and/or size prefix.
SIMPLE_TYPE = textwrap.dedent(rf'''
    # simple type
    (?:
        \b
        (?:
            void
            |
            (?: signed | unsigned )  # implies int
            |
            (?:
                (?: (?: signed | unsigned ) \s+ )?
                (?: (?: long | short ) \s+ )?
                (?: char | short | int | long | float | double )
            )
        )
        \b
    )
    # end simple type
    ''')

COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'


#######################################
# variable declarations

_STORAGE = 'auto register static extern'.split()
STORAGE_CLASS = rf'(?: \b (?: {" | ".join(_STORAGE)} ) \b )'
TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
# A "*" optionally followed by a type qualifier.
PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )'

# A simple type, a typeof() expression, a compound type reference,
# or a bare (typedef) name.
TYPE_SPEC = textwrap.dedent(rf'''
    # type spec
    (?:
        {_ind(SIMPLE_TYPE, 2)}
        |
        (?:
            [_]*typeof[_]*
            \s* [(]
            (?: \s* [*&] )*
            \s* {STRICT_IDENTIFIER}
            \s* [)]
        )
        |
        # reference to a compound type
        (?:
            {COMPOUND_TYPE_KIND}
            (?: \s* {ANON_IDENTIFIER} )?
        )
        |
        # reference to a typedef
        {STRICT_IDENTIFIER}
    )
    # end type spec
    ''')

# Pointer qualifiers followed by a plain name, a parenthesized name,
# or a function pointer; each form may carry array suffixes.
DECLARATOR = textwrap.dedent(rf'''
    # declarator  (possibly abstract)
    (?:
        (?: {PTR_QUALIFIER} \s* )*
        (?:
            (?:
                (?:  # <IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
            )
            |
            (?:
                [(] \s*
                (?:  # <WRAPPED_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
                \s* [)]
            )
            |
            # func ptr
            (?:
                [(] (?: \s* {PTR_QUALIFIER} )? \s*
                (?:  # <FUNC_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
                \s* [)]
                # We allow for a single level of paren nesting in parameters.
                \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
            )
        )
    )
    # end declarator
    ''')

# Optional storage class and qualifier, then a type spec and declarator.
VAR_DECL = textwrap.dedent(rf'''
    # var decl (and typedef and func return type)
    (?:
        (?:
            (?:  # <STORAGE>
                {STORAGE_CLASS}
            )
            \s*
        )?
        (?:
            (?:  # <TYPE_QUAL>
                {TYPE_QUALIFIER}
            )
            \s*
        )?
        (?:
            (?:  # <TYPE_SPEC>
                {_ind(TYPE_SPEC, 4)}
            )
        )
        \s*
        (?:
            (?:  # <DECLARATOR>
                {_ind(DECLARATOR, 4)}
            )
        )
    )
    # end var decl
    ''')

# The right-hand side of "=": an optional parenthesized prefix followed
# by string literal(s), a simple expression, or a brace-enclosed literal.
INITIALIZER = textwrap.dedent(rf'''
    # initializer
    (?:
        (?:
            [(]
            # no nested parens (e.g. func ptr)
            [^)]*
            [)]
            \s*
        )?
        (?:
            # a string literal
            (?:
                (?: {_ind(STRING_LITERAL, 4)} \s* )*
                {_ind(STRING_LITERAL, 4)}
            )
            |

            # a simple initializer
            (?:
                (?:
                    [^'",;{{]*
                    {_ind(STRING_LITERAL, 4)}
                )*
                [^'",;{{]*
            )
            |

            # a struct/array literal
            (?:
                # We only expect compound initializers with
                # single-variable declarations.
                {{
                (?:
                    [^'";]*?
                    {_ind(STRING_LITERAL, 5)}
                )*
                [^'";]*?
                }}
                (?= \s* ; )  # Note this lookahead.
            )
        )
    )
    # end initializer
    ''')


#######################################
# compound type declarations

# One step inside a struct/union body: the opening of a nested compound
# type, a (possibly sized) member, or the closing brace.
STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        # inline compound type decl
        (?:
            (?:  # <COMPOUND_TYPE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?:  # <COMPOUND_TYPE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        (?:
            # typed member
            (?:
                # Technically it doesn't have to have a type...
                (?:  # <SPECIFIER_QUALIFIER>
                    (?: {TYPE_QUALIFIER} \s* )?
                    {_ind(TYPE_SPEC, 5)}
                )
                (?:
                    # If it doesn't have a declarator then it will have
                    # a size and vice versa.
                    \s*
                    (?:  # <DECLARATOR>
                        {_ind(DECLARATOR, 6)}
                    )
                )?
            )

            # sized member
            (?:
                \s* [:] \s*
                (?:  # <SIZE>
                    \d+
                )
            )?
            \s*
            (?:  # <ENDING>
                [,;]
            )
        )
        |
        (?:
            \s*
            (?:  # <CLOSE>
                }}
            )
        )
    )
    ''')

# One step inside an enum body: the closing brace, or a member name
# with an optional "= value" followed by "," or the closing brace.
ENUM_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        (?:
            \s*
            (?:  # <CLOSE>
                }}
            )
        )
        |
        (?:
            \s*
            (?:  # <NAME>
                {IDENTIFIER}
            )
            (?:
                \s* = \s*
                (?:  # <INIT>
                    {_ind(STRING_LITERAL, 4)}
                    |
                    [^'",}}]+
                )
            )?
            \s*
            (?:  # <ENDING>
                , | }}
            )
        )
    )
    ''')


#######################################
# statements

# Anything up to (not including) the next brace or semicolon,
# stepping over string/character literals.
SIMPLE_STMT_BODY = textwrap.dedent(rf'''
    # simple statement body
    (?:
        (?:
            [^'"{{}};]*
            {_ind(STRING_LITERAL, 3)}
        )*
        [^'"{{}};]*
        #(?= [;{{] )  # Note this lookahead.
    )
    # end simple statement body
    ''')
# A statement terminated by ";".  In the "variable assignment" branch
# the member-access operators are matched literally ("." and "->").
SIMPLE_STMT = textwrap.dedent(rf'''
    # simple statement
    (?:
        (?:  # <SIMPLE_STMT>
            # stmt-inline "initializer"
            (?:
                return \b
                (?:
                    \s*
                    {_ind(INITIALIZER, 5)}
                )?
            )
            |
            # variable assignment
            (?:
                (?: [*] \s* )?
                (?:
                    {STRICT_IDENTIFIER} \s*
                    (?: [.] | -> ) \s*
                )*
                {STRICT_IDENTIFIER}
                (?: \s* \[ \s* \d+ \s* \] )?
                \s* = \s*
                {_ind(INITIALIZER, 4)}
            )
            |
            # catchall return statement
            (?:
                return \b
                (?:
                    (?:
                        [^'";]*
                        {_ind(STRING_LITERAL, 6)}
                    )*
                    \s* [^'";]*
                )?
            )
            |
            # simple statement
            (?:
                {_ind(SIMPLE_STMT_BODY, 4)}
            )
        )
        \s*
        (?:  # <SIMPLE_ENDING>
            ;
        )
    )
    # end simple statement
    ''')
# The head of a compound statement: "else"/"do", a label ("case ...:",
# "default:", "name:"), or a keyword that is followed by "(".
COMPOUND_STMT = textwrap.dedent(rf'''
    # compound statement
    (?:
        \b
        (?:
            (?:
                (?:  # <COMPOUND_BARE>
                    else | do
                )
                \b
            )
            |
            (?:
                (?:  # <COMPOUND_LABELED>
                    (?:
                        case \b
                        (?:
                            [^'":]*
                            {_ind(STRING_LITERAL, 7)}
                        )*
                        \s* [^'":]*
                    )
                    |
                    default
                    |
                    {STRICT_IDENTIFIER}
                )
                \s* [:]
            )
            |
            (?:
                (?:  # <COMPOUND_PAREN>
                    for | while | if | switch
                )
                \s* (?= [(] )  # Note this lookahead.
            )
        )
        \s*
    )
    # end compound statement
    ''')


#######################################
# function bodies

# One item inside a function body.
LOCAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?:  # <EMPTY>
            ;
        )
        |
        # inline type decl
        (?:
            (?:
                (?:  # <INLINE_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?:  # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
            )?  # </INLINE_PRE>
            (?:  # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?:  # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # var decl
        (?:
            (?:  # <STORAGE>
                {STORAGE_CLASS}
            )?  # </STORAGE>
            (?:
                \s*
                (?:  # <VAR_DECL>
                    {_ind(VAR_DECL, 5)}
                )
            )
            (?:
                (?:
                    # initializer
                    # We expect only basic initializers.
                    \s* = \s*
                    (?:  # <VAR_INIT>
                        {_ind(INITIALIZER, 6)}
                    )
                )?
                (?:
                    \s*
                    (?:  # <VAR_ENDING>
                        [,;]
                    )
                )
            )
        )
        |
        {_ind(COMPOUND_STMT, 2)}
        |
        # start-of-block
        (?:
            (?:  # <BLOCK_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 5)}
                )*
                [^'"{{}};]*
                # Presumably we will not see "== {{".
                [^\s='"{{}});]
                \s*
            )?  # </BLOCK_LEADING>
            (?:  # <BLOCK_OPEN>
                {{
            )
        )
        |
        {_ind(SIMPLE_STMT, 2)}
        |
        # end-of-block
        (?:  # <BLOCK_CLOSE>
            }}
        )
    )
    ''')

# Like LOCAL, but only "static" variable declarations are matched in
# detail; everything else is consumed up to the next delimiter.
LOCAL_STATICS = textwrap.dedent(rf'''
    (?:
        # inline type decl
        (?:
            (?:
                (?:  # <INLINE_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?:  # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
            )?
            (?:  # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?:  # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # var decl
        (?:
            # We only look for static variables.
            (?:  # <STATIC_DECL>
                static \b
                (?: \s* {TYPE_QUALIFIER} )?
                \s* {_ind(TYPE_SPEC, 4)}
                \s* {_ind(DECLARATOR, 4)}
            )
            \s*
            (?:
                (?:  # <STATIC_INIT>
                    = \s*
                    {_ind(INITIALIZER, 4)}
                    \s*
                    [,;{{]
                )
                |
                (?:  # <STATIC_ENDING>
                    [,;]
                )
            )
        )
        |
        # everything else
        (?:
            (?:  # <DELIM_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 4)}
                )*
                \s* [^'"{{}};]*
            )
            (?:
                (?:  # <BLOCK_OPEN>
                    {{
                )
                |
                (?:  # <BLOCK_CLOSE>
                    }}
                )
                |
                (?:  # <STMT_END>
                    ;
                )
            )
        )
    )
    ''')


#######################################
# global declarations

# One item at file scope.
GLOBAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?:  # <EMPTY>
            ;
        )
        |

        # compound type decl (maybe inline)
        (?:
            (?:
                (?:  # <COMPOUND_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?:  # <COMPOUND_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?:  # <COMPOUND_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # bogus inline decl artifact
        # This simplifies resolving the relative syntactic ambiguity of
        # inline structs.
        (?:
            (?:  # <FORWARD_KIND>
                {COMPOUND_TYPE_KIND}
            )
            \s*
            (?:  # <FORWARD_NAME>
                {ANON_IDENTIFIER}
            )
            (?:  # <MAYBE_INLINE_ACTUAL>
                [^=,;({{[*\]]*
                [=,;({{]
            )
        )
        |

        # typedef
        (?:
            \b typedef \b \s*
            (?:  # <TYPEDEF_DECL>
                {_ind(VAR_DECL, 4)}
            )
            (?:
                # We expect no inline type definitions in the parameters.
                \s* [(] \s*
                (?:  # <TYPEDEF_FUNC_PARAMS>
                    [^{{;]*
                )
                \s* [)]
            )?
            \s* ;
        )
        |

        # func decl/definition & var decls
        # XXX dedicated pattern for funcs (more restricted)?
        (?:
            (?:
                (?:  # <VAR_STORAGE>
                    {STORAGE_CLASS}
                )
                \s*
            )?
            (?:
                (?:  # <FUNC_INLINE>
                    \b inline \b
                )
                \s*
            )?
            (?:  # <VAR_DECL>
                {_ind(VAR_DECL, 4)}
            )
            (?:
                # func decl / definition
                (?:
                    (?:
                        # We expect no inline type definitions in the parameters.
                        \s* [(] \s*
                        (?:  # <FUNC_PARAMS>
                            [^{{;]*
                        )
                        \s* [)] \s*
                        (?:  # <FUNC_DELIM>
                            [{{;]
                        )
                    )
                    |
                    (?:
                        # This is some old-school syntax!
                        \s* [(] \s*
                        # We throw away the bare names:
                        {STRICT_IDENTIFIER}
                        (?: \s* , \s* {STRICT_IDENTIFIER} )*
                        \s* [)] \s*

                        # We keep the trailing param declarations:
                        (?:  # <FUNC_LEGACY_PARAMS>
                            # There's at least one!
                            (?: {TYPE_QUALIFIER} \s* )?
                            {_ind(TYPE_SPEC, 7)}
                            \s*
                            {_ind(DECLARATOR, 7)}
                            \s* ;
                            (?:
                                \s*
                                (?: {TYPE_QUALIFIER} \s* )?
                                {_ind(TYPE_SPEC, 8)}
                                \s*
                                {_ind(DECLARATOR, 8)}
                                \s* ;
                            )*
                        )
                        \s* {{
                    )
                )
                |
                # var / typedef
                (?:
                    (?:
                        # initializer
                        # We expect only basic initializers.
                        \s* = \s*
                        (?:  # <VAR_INIT>
                            {_ind(INITIALIZER, 6)}
                        )
                    )?
                    \s*
                    (?:  # <VAR_ENDING>
                        [,;]
                    )
                )
            )
        )
    )
    ''')