# Regular expression patterns for C syntax.
#
# None of these patterns has any capturing. However, a number of them
# have capturing markers compatible with utils.set_capture_groups().
#
# Every pattern is laid out with insignificant whitespace and "#"
# comments, so it is only meaningful when compiled with re.VERBOSE.
#
# NOTE(review): the "(?: # <NAME>" markers are presumably located
# textually by utils.set_capture_groups(); confirm the exact spelling
# that helper expects before reformatting any of them.

import textwrap


def _ind(text, level=1, edges='both'):
    """Return *text* re-indented for embedding in a larger pattern.

    Every non-blank line of *text* is indented by *level* indent units.

    *edges* selects which ends of the result are normalized:

    * 'pre'  - leading whitespace is replaced by a newline plus the
      indent, so the fragment starts on a fresh, indented line.
    * 'post' - trailing whitespace is replaced by a newline plus one
      level less of indent, so whatever follows the fragment in the
      enclosing pattern starts on a new line (and therefore cannot be
      swallowed by a trailing "#" comment of the fragment).
    * 'both' - both of the above (the default).

    Any other value leaves both ends untouched.
    """
    unit = ' '
    indent = unit * level
    text = textwrap.indent(text, indent)
    if edges in ('pre', 'both'):
        text = '\n' + indent + text.lstrip()
    if edges in ('post', 'both'):
        text = text.rstrip() + '\n' + unit * (level - 1)
    return text


#######################################
# general

# A single hexadecimal digit.
HEX = r'(?: [0-9a-fA-F] )'

# A character literal (plain, backslash-escaped, hex or octal-style
# escape) or a double-quoted string literal with backslash escapes.
STRING_LITERAL = textwrap.dedent(rf'''
    (?:
        # character literal
        (?:
            ['] [^'] [']
            |
            ['] \\ . [']
            |
            ['] \\x{HEX}{HEX} [']
            |
            ['] \\0\d\d [']
            |
            (?:
                ['] \\o[01]\d\d [']
                |
                ['] \\o2[0-4]\d [']
                |
                ['] \\o25[0-5] [']
            )
        )
        |
        # string literal
        (?:
            ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
        )
        # end string literal
    )
    ''')

# Any single keyword, as a whole word.
_KEYWORD = textwrap.dedent(r'''
    (?:
        \b
        (?:
            auto |
            extern |
            register |
            static |
            _Thread_local |
            typedef |

            const |
            volatile |

            signed |
            unsigned |
            char |
            short |
            int |
            long |
            float |
            double |
            void |

            struct |
            union |
            enum |

            goto |
            return |
            sizeof |
            break |
            continue |
            if |
            else |
            for |
            do |
            while |
            switch |
            case |
            default |
            entry
        )
        \b
    )
    ''')
# The public form keeps the readable multi-line layout...
KEYWORD = rf'''
    # keyword
    {_KEYWORD}
    # end keyword
    '''
# ...while the private form is squeezed onto one line (all whitespace
# removed) so it can be embedded in the single-line patterns below.
_KEYWORD = ''.join(_KEYWORD.split())

IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
# We use a negative lookahead to filter out keywords.
STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
# Like STRICT_IDENTIFIER, but also allows a trailing "-<digits>" suffix.
ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'


#######################################
# types

# A builtin type, optionally with signedness and long/short prefixes.
SIMPLE_TYPE = textwrap.dedent(rf'''
    # simple type
    (?:
        \b
        (?:
            void
            |
            (?: signed | unsigned ) # implies int
            |
            (?:
                (?: (?: signed | unsigned ) \s+ )?
                (?: (?: long | short ) \s+ )?
                (?: char | short | int | long | float | double )
            )
        )
        \b
    )
    # end simple type
    ''')

COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'


#######################################
# variable declarations

_STORAGE = 'auto register static extern _Thread_local'.split()
STORAGE_CLASS = rf'(?: \b (?: {" | ".join(_STORAGE)} ) \b )'
TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
# A "*" optionally followed by a type qualifier.
PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )'

# A simple type, a typeof(...) expression, a reference to a compound
# type, or a (non-keyword) typedef name.
TYPE_SPEC = textwrap.dedent(rf'''
    # type spec
    (?:
        {_ind(SIMPLE_TYPE, 2)}
        |
        (?:
            [_]*typeof[_]*
            \s* [(]
            (?: \s* [*&] )*
            \s* {STRICT_IDENTIFIER}
            \s* [)]
        )
        |
        # reference to a compound type
        (?:
            {COMPOUND_TYPE_KIND}
            (?: \s* {ANON_IDENTIFIER} )?
        )
        |
        # reference to a typedef
        {STRICT_IDENTIFIER}
    )
    # end type spec
    ''')

# Pointer qualifiers followed by a plain name, a parenthesized name, or
# a function pointer; each form may carry array suffixes.
DECLARATOR = textwrap.dedent(rf'''
    # declarator (possibly abstract)
    (?:
        (?: {PTR_QUALIFIER} \s* )*
        (?:
            (?:
                (?: # <IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
            )
            |
            (?:
                [(] \s*
                (?: # <WRAPPED_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
                \s* [)]
            )
            |
            # func ptr
            (?:
                [(] (?: \s* {PTR_QUALIFIER} )? \s*
                (?: # <FUNC_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
                \s* [)]
                # We allow for a single level of paren nesting in parameters.
                \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
            )
        )
    )
    # end declarator
    ''')

# Optional storage class and type qualifier, then a type spec and a
# declarator.
VAR_DECL = textwrap.dedent(rf'''
    # var decl (and typedef and func return type)
    (?:
        (?:
            (?: # <STORAGE>
                {STORAGE_CLASS}
            )
            \s*
        )?
        (?:
            (?: # <TYPE_QUAL>
                {TYPE_QUALIFIER}
            )
            \s*
        )?
        (?:
            (?: # <TYPE_SPEC>
                {_ind(TYPE_SPEC, 4)}
            )
        )
        \s*
        (?:
            (?: # <DECLARATOR>
                {_ind(DECLARATOR, 4)}
            )
        )
    )
    # end var decl
    ''')

# An optional parenthesized prefix followed by string literal(s), a
# brace-free "simple" initializer, or a brace-delimited literal that is
# directly followed by ";".
INITIALIZER = textwrap.dedent(rf'''
    # initializer
    (?:
        (?:
            [(]
            # no nested parens (e.g. func ptr)
            [^)]*
            [)]
            \s*
        )?
        (?:
            # a string literal
            (?:
                (?: {_ind(STRING_LITERAL, 4)} \s* )*
                {_ind(STRING_LITERAL, 4)}
            )
            |

            # a simple initializer
            (?:
                (?:
                    [^'",;{{]*
                    {_ind(STRING_LITERAL, 4)}
                )*
                [^'",;{{]*
            )
            |

            # a struct/array literal
            (?:
                # We only expect compound initializers with
                # single-variable declarations.
                {{
                    (?:
                        [^'";]*?
                        {_ind(STRING_LITERAL, 5)}
                    )*
                    [^'";]*?
                }}
                (?= \s* ; ) # Note this lookahead.
            )
        )
    )
    # end initializer
    ''')


#######################################
# compound type declarations

# One step inside a struct/union body: the opening of an inline compound
# type, a (possibly sized) member, or the closing brace.
STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        # inline compound type decl
        (?:
            (?: # <COMPOUND_TYPE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <COMPOUND_TYPE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        (?:
            # typed member
            (?:
                # Technically it doesn't have to have a type...
                (?: # <SPECIFIER_QUALIFIER>
                    (?: {TYPE_QUALIFIER} \s* )?
                    {_ind(TYPE_SPEC, 5)}
                )
                (?:
                    # If it doesn't have a declarator then it will have
                    # a size and vice versa.
                    \s*
                    (?: # <DECLARATOR>
                        {_ind(DECLARATOR, 6)}
                    )
                )?
            )

            # sized member
            (?:
                \s* [:] \s*
                (?: # <SIZE>
                    # This is actually a "constant expression".
                    \d+
                    |
                    [^'",}}]+
                )
            )?
            \s*
            (?: # <ENDING>
                [,;]
            )
        )
        |
        (?:
            \s*
            (?: # <CLOSE>
                }}
            )
        )
    )
    ''')

# One step inside an enum body: the closing brace, or a name with an
# optional "= value" that ends in "," or the closing brace.
ENUM_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        (?:
            \s*
            (?: # <CLOSE>
                }}
            )
        )
        |
        (?:
            \s*
            (?: # <NAME>
                {IDENTIFIER}
            )
            (?:
                \s* = \s*
                (?: # <INIT>
                    # This is actually a "constant expression".
                    {_ind(STRING_LITERAL, 4)}
                    |
                    [^'",}}]+
                )
            )?
            \s*
            (?: # <ENDING>
                , | }}
            )
        )
    )
    ''')


#######################################
# statements

# Any run of text (string literals included) free of braces and ";".
SIMPLE_STMT_BODY = textwrap.dedent(rf'''
    # simple statement body
    (?:
        (?:
            [^'"{{}};]*
            {_ind(STRING_LITERAL, 3)}
        )*
        [^'"{{}};]*
        #(?= [;{{] ) # Note this lookahead.
    )
    # end simple statement body
    ''')
# A return, an assignment to a (possibly member-accessed) name, or any
# other simple statement body, terminated by ";".
# Member access is matched with a literal "." (or "->"); a bare "."
# would match any character.
SIMPLE_STMT = textwrap.dedent(rf'''
    # simple statement
    (?:
        (?: # <SIMPLE_STMT>
            # stmt-inline "initializer"
            (?:
                return \b
                (?:
                    \s*
                    {_ind(INITIALIZER, 5)}
                )?
            )
            |
            # variable assignment
            (?:
                (?: [*] \s* )?
                (?:
                    {STRICT_IDENTIFIER} \s*
                    (?: [.] | -> ) \s*
                )*
                {STRICT_IDENTIFIER}
                (?: \s* \[ \s* \d+ \s* \] )?
                \s* = \s*
                {_ind(INITIALIZER, 4)}
            )
            |
            # catchall return statement
            (?:
                return \b
                (?:
                    (?:
                        [^'";]*
                        {_ind(STRING_LITERAL, 6)}
                    )*
                    \s* [^'";]*
                )?
            )
            |
            # simple statement
            (?:
                {_ind(SIMPLE_STMT_BODY, 4)}
            )
        )
        \s*
        (?: # <SIMPLE_ENDING>
            ;
        )
    )
    # end simple statement
    ''')
# The leading part of a compound statement: a bare "else"/"do", a
# "case ..."/"default"/label followed by ":", or for/while/if/switch
# directly before "(".
COMPOUND_STMT = textwrap.dedent(rf'''
    # compound statement
    (?:
        \b
        (?:
            (?:
                (?: # <COMPOUND_BARE>
                    else | do
                )
                \b
            )
            |
            (?:
                (?: # <COMPOUND_LABELED>
                    (?:
                        case \b
                        (?:
                            [^'":]*
                            {_ind(STRING_LITERAL, 7)}
                        )*
                        \s* [^'":]*
                    )
                    |
                    default
                    |
                    {STRICT_IDENTIFIER}
                )
                \s* [:]
            )
            |
            (?:
                (?: # <COMPOUND_PAREN>
                    for | while | if | switch
                )
                \s* (?= [(] ) # Note this lookahead.
            )
        )
        \s*
    )
    # end compound statement
    ''')


#######################################
# function bodies

# One item inside a function body: an empty statement, the opening of an
# inline type declaration, a variable declaration, the head of a
# compound statement, a block opening, a simple statement, or a block
# close.
LOCAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?: # <EMPTY>
            ;
        )
        |
        # inline type decl
        (?:
            (?:
                (?: # <INLINE_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?: # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
            )? # </INLINE_PRE>
            (?: # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # var decl
        (?:
            (?: # <STORAGE>
                {STORAGE_CLASS}
            )? # </STORAGE>
            (?:
                \s*
                (?: # <VAR_DECL>
                    {_ind(VAR_DECL, 5)}
                )
            )
            (?:
                (?:
                    # initializer
                    # We expect only basic initializers.
                    \s* = \s*
                    (?: # <VAR_INIT>
                        {_ind(INITIALIZER, 6)}
                    )
                )?
                (?:
                    \s*
                    (?: # <VAR_ENDING>
                        [,;]
                    )
                )
            )
        )
        |
        {_ind(COMPOUND_STMT, 2)}
        |
        # start-of-block
        (?:
            (?: # <BLOCK_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 5)}
                )*
                [^'"{{}};]*
                # Presumably we will not see "== {{".
                [^\s='"{{}});]
                \s*
            )? # </BLOCK_LEADING>
            (?: # <BLOCK_OPEN>
                {{
            )
        )
        |
        {_ind(SIMPLE_STMT, 2)}
        |
        # end-of-block
        (?: # <BLOCK_CLOSE>
            }}
        )
    )
    ''')

# Like LOCAL, but only "static" variable declarations and inline type
# declarations are recognized; everything else is consumed up to the
# next "{", "}" or ";".
LOCAL_STATICS = textwrap.dedent(rf'''
    (?:
        # inline type decl
        (?:
            (?:
                (?: # <INLINE_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?: # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
            )?
            (?: # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # var decl
        (?:
            # We only look for static variables.
            (?: # <STATIC_DECL>
                static \b
                (?: \s* {TYPE_QUALIFIER} )?
                \s* {_ind(TYPE_SPEC, 4)}
                \s* {_ind(DECLARATOR, 4)}
            )
            \s*
            (?:
                (?: # <STATIC_INIT>
                    = \s*
                    {_ind(INITIALIZER, 4)}
                    \s*
                    [,;{{]
                )
                |
                (?: # <STATIC_ENDING>
                    [,;]
                )
            )
        )
        |
        # everything else
        (?:
            (?: # <DELIM_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 4)}
                )*
                \s* [^'"{{}};]*
            )
            (?:
                (?: # <BLOCK_OPEN>
                    {{
                )
                |
                (?: # <BLOCK_CLOSE>
                    }}
                )
                |
                (?: # <STMT_END>
                    ;
                )
            )
        )
    )
    ''')


#######################################
# global declarations

# One top-level item: an empty statement, the opening of a compound type
# declaration, a forward/inline compound reference, a typedef, or a
# function/variable declaration.
GLOBAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?: # <EMPTY>
            ;
        )
        |

        # compound type decl (maybe inline)
        (?:
            (?:
                (?: # <COMPOUND_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?: # <COMPOUND_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <COMPOUND_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # bogus inline decl artifact
        # This simplifies resolving the relative syntactic ambiguity of
        # inline structs.
        (?:
            (?: # <FORWARD_KIND>
                {COMPOUND_TYPE_KIND}
            )
            \s*
            (?: # <FORWARD_NAME>
                {ANON_IDENTIFIER}
            )
            (?: # <MAYBE_INLINE_ACTUAL>
                [^=,;({{[*\]]*
                [=,;({{]
            )
        )
        |

        # typedef
        (?:
            \b typedef \b \s*
            (?: # <TYPEDEF_DECL>
                {_ind(VAR_DECL, 4)}
            )
            (?:
                # We expect no inline type definitions in the parameters.
                \s* [(] \s*
                (?: # <TYPEDEF_FUNC_PARAMS>
                    [^{{;]*
                )
                \s* [)]
            )?
            \s* ;
        )
        |

        # func decl/definition & var decls
        # XXX dedicated pattern for funcs (more restricted)?
        (?:
            (?:
                (?: # <VAR_STORAGE>
                    {STORAGE_CLASS}
                )
                \s*
            )?
            (?:
                (?: # <FUNC_INLINE>
                    \b inline \b
                )
                \s*
            )?
            (?: # <VAR_DECL>
                {_ind(VAR_DECL, 4)}
            )
            (?:
                # func decl / definition
                (?:
                    (?:
                        # We expect no inline type definitions in the parameters.
                        \s* [(] \s*
                        (?: # <FUNC_PARAMS>
                            [^{{;]*
                        )
                        \s* [)] \s*
                        (?: # <FUNC_DELIM>
                            [{{;]
                        )
                    )
                    |
                    (?:
                        # This is some old-school syntax!
                        \s* [(] \s*
                        # We throw away the bare names:
                        {STRICT_IDENTIFIER}
                        (?: \s* , \s* {STRICT_IDENTIFIER} )*
                        \s* [)] \s*

                        # We keep the trailing param declarations:
                        (?: # <FUNC_LEGACY_PARAMS>
                            # There's at least one!
                            (?: {TYPE_QUALIFIER} \s* )?
                            {_ind(TYPE_SPEC, 7)}
                            \s*
                            {_ind(DECLARATOR, 7)}
                            \s* ;
                            (?:
                                \s*
                                (?: {TYPE_QUALIFIER} \s* )?
                                {_ind(TYPE_SPEC, 8)}
                                \s*
                                {_ind(DECLARATOR, 8)}
                                \s* ;
                            )*
                        )
                        \s* {{
                    )
                )
                |
                # var / typedef
                (?:
                    (?:
                        # initializer
                        # We expect only basic initializers.
                        \s* = \s*
                        (?: # <VAR_INIT>
                            {_ind(INITIALIZER, 6)}
                        )
                    )?
                    \s*
                    (?: # <VAR_ENDING>
                        [,;]
                    )
                )
            )
        )
    )
    ''')