• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Regular expression patterns for C syntax.
2#
3# None of these patterns has any capturing.  However, a number of them
4# have capturing markers compatible with utils.set_capture_groups().
5
6import textwrap
7
8
def _ind(text, level=1, edges='both'):
    """Indent *text* by *level* four-space steps for embedding in a pattern.

    *edges* selects how the ends of the result are adjusted:

    * 'pre' or 'both': leading whitespace is stripped and the text then
      starts with a newline followed by the indent.
    * 'post' or 'both': trailing whitespace is stripped and the text then
      ends with a newline followed by one indent step less than *level*.

    Any other value leaves both ends as textwrap.indent() produced them.
    """
    pad = '    ' * level
    result = textwrap.indent(text, pad)
    if edges in ('pre', 'both'):
        result = f'\n{pad}{result.lstrip()}'
    if edges in ('post', 'both'):
        # The trailing pad lines up whatever follows with the outer level.
        outer_pad = '    ' * (level - 1)
        result = f'{result.rstrip()}\n{outer_pad}'
    return result
17
18
19#######################################
20# general
21
# A single hexadecimal digit, in either case.  Only 0-9, a-f and A-F are
# valid here; letters past "f" are not hex digits.
HEX = r'(?: [0-9a-fA-F] )'
23
# Matches one C character literal or one C string literal.
#
# The pattern is written in verbose-regex syntax: the whitespace and the
# "#" comments inside it are only ignored when it is compiled with
# re.VERBOSE.
#
# Character-literal alternatives, in order: a single non-quote character,
# a backslash followed by any one character, "\x" plus two HEX characters,
# "\0" plus two digits, and "\o" plus a three-digit value from 000 to 255.
# The string-literal alternative is a double-quoted run in which a
# backslash always consumes the following character.
STRING_LITERAL = textwrap.dedent(rf'''
    (?:
        # character literal
        (?:
            ['] [^'] [']
            |
            ['] \\ . [']
            |
            ['] \\x{HEX}{HEX} [']
            |
            ['] \\0\d\d [']
            |
            (?:
                ['] \\o[01]\d\d [']
                |
                ['] \\o2[0-4]\d [']
                |
                ['] \\o25[0-5] [']
             )
         )
        |
        # string literal
        (?:
            ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
         )
        # end string literal
     )
    ''')
52
# Matches any one reserved word as a whole word.  The alternatives are
# grouped as: storage classes and typedef, type qualifiers, basic types,
# compound-type kinds, and statement keywords (plus "entry").
_KEYWORD = textwrap.dedent(r'''
    (?:
        \b
        (?:
            auto |
            extern |
            register |
            static |
            typedef |

            const |
            volatile |

            signed |
            unsigned |
            char |
            short |
            int |
            long |
            float |
            double |
            void |

            struct |
            union |
            enum |

            goto |
            return |
            sizeof |
            break |
            continue |
            if |
            else |
            for |
            do |
            while |
            switch |
            case |
            default |
            entry
         )
        \b
     )
    ''')
# Public form: the multi-line keyword pattern wrapped in marker comments.
# This must be built before _KEYWORD is compacted below.
KEYWORD = rf'''
    # keyword
    {_KEYWORD}
    # end keyword
    '''
# From here on _KEYWORD is the same pattern with all whitespace removed,
# so it can be embedded on a single line.
_KEYWORD = ''.join(_KEYWORD.split())
104
# A bare identifier: a letter or underscore followed by any number of
# letters, digits or underscores.  Keywords are NOT excluded here.
IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
# We use a negative lookahead to filter out keywords.
STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
# Same as STRICT_IDENTIFIER, but also accepts an optional "-<digits>"
# suffix after the name.
# NOTE(review): presumably the suffix marks generated names for anonymous
# compound types -- confirm against the code that produces such names.
ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'
109
110
111#######################################
112# types
113
# Matches a built-in type name as a whole word.  The alternatives are:
# "void"; a bare "signed" or "unsigned"; or an optional sign, then an
# optional "long"/"short" prefix, then one of
# char/short/int/long/float/double.
SIMPLE_TYPE = textwrap.dedent(rf'''
    # simple type
    (?:
        \b
        (?:
            void
            |
            (?: signed | unsigned )  # implies int
            |
            (?:
                (?: (?: signed | unsigned ) \s+ )?
                (?: (?: long | short ) \s+ )?
                (?: char | short | int | long | float | double )
             )
         )
        \b
     )
    # end simple type
    ''')
133
# Matches one of the keywords "struct", "union" or "enum" as a whole word.
COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'
135
136
137#######################################
138# variable declarations
139
# The storage-class keywords, in the order they appear in STORAGE_CLASS.
_STORAGE = 'auto register static extern'.split()
# Any one storage-class keyword, matched as a whole word.
STORAGE_CLASS = r'(?: \b (?: ' + ' | '.join(_STORAGE) + r' ) \b )'
# "const" or "volatile", matched as a whole word.
TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
# A single "*" optionally followed by one type qualifier.
PTR_QUALIFIER = r'(?: [*] (?: \s* ' + TYPE_QUALIFIER + r' )? )'
144
# Matches a type specifier.  The alternatives, in order, are:
#   * a simple (built-in) type,
#   * "typeof" (with any number of surrounding underscores) applied to a
#     parenthesized identifier, optionally preceded by "*" / "&",
#   * a struct/union/enum keyword with an optional name,
#   * a plain non-keyword identifier, treated as a typedef name.
TYPE_SPEC = textwrap.dedent(rf'''
    # type spec
    (?:
        {_ind(SIMPLE_TYPE, 2)}
        |
        (?:
            [_]*typeof[_]*
            \s* [(]
            (?: \s* [*&] )*
            \s* {STRICT_IDENTIFIER}
            \s* [)]
         )
        |
        # reference to a compound type
        (?:
            {COMPOUND_TYPE_KIND}
            (?: \s* {ANON_IDENTIFIER} )?
         )
        |
        # reference to a typedef
        {STRICT_IDENTIFIER}
     )
    # end type spec
    ''')
169
# Matches a declarator: any number of pointer qualifiers followed by one
# of three shapes:
#   * an identifier with optional "[...]" array suffixes,
#   * the same thing wrapped in parentheses,
#   * a function pointer: "(" optional-pointer identifier arrays ")"
#     followed by a parenthesized parameter list.
# The "# <NAME>" comments inside the pattern are the capture markers
# mentioned in the module header.
DECLARATOR = textwrap.dedent(rf'''
    # declarator  (possibly abstract)
    (?:
        (?: {PTR_QUALIFIER} \s* )*
        (?:
            (?:
                (?:  # <IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
             )
            |
            (?:
                [(] \s*
                (?:  # <WRAPPED_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
                \s* [)]
             )
            |
            # func ptr
            (?:
                [(] (?: \s* {PTR_QUALIFIER} )? \s*
                (?:  # <FUNC_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
                \s* [)]
                # We allow for a single level of paren nesting in parameters.
                \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
             )
         )
     )
    # end declarator
    ''')
209
# Matches the head of a declaration: an optional storage class, an
# optional type qualifier, a type spec and a declarator, in that order.
# It stops before any initializer or terminating punctuation.
# The "# <NAME>" comments inside the pattern are the capture markers
# mentioned in the module header.
VAR_DECL = textwrap.dedent(rf'''
    # var decl (and typedef and func return type)
    (?:
        (?:
            (?:  # <STORAGE>
                {STORAGE_CLASS}
            )
            \s*
        )?
        (?:
            (?:  # <TYPE_QUAL>
                {TYPE_QUALIFIER}
            )
            \s*
         )?
        (?:
            (?:  # <TYPE_SPEC>
                {_ind(TYPE_SPEC, 4)}
            )
         )
        \s*
        (?:
            (?:  # <DECLARATOR>
                {_ind(DECLARATOR, 4)}
            )
         )
     )
    # end var decl
    ''')
239
# Matches the right-hand side of an initialization.  An optional leading
# parenthesized chunk without nested parens comes first (presumably a
# cast -- confirm), followed by one of:
#   * one or more adjacent string/char literals,
#   * a "simple" initializer that contains no comma, semicolon or open
#     brace outside of literals,
#   * a brace-enclosed literal, accepted only when a ";" follows it.
# "{{" and "}}" are f-string escapes that produce literal braces in the
# final pattern.
INITIALIZER = textwrap.dedent(rf'''
    # initializer
    (?:
        (?:
            [(]
            # no nested parens (e.g. func ptr)
            [^)]*
            [)]
            \s*
         )?
        (?:
            # a string literal
            (?:
                (?: {_ind(STRING_LITERAL, 4)} \s* )*
                {_ind(STRING_LITERAL, 4)}
             )
            |

            # a simple initializer
            (?:
                (?:
                    [^'",;{{]*
                    {_ind(STRING_LITERAL, 4)}
                 )*
                [^'",;{{]*
             )
            |

            # a struct/array literal
            (?:
                # We only expect compound initializers with
                # single-variable declarations.
                {{
                (?:
                    [^'";]*?
                    {_ind(STRING_LITERAL, 5)}
                 )*
                [^'";]*?
                }}
                (?= \s* ; )  # Note this lookahead.
             )
         )
     )
    # end initializer
    ''')
285
286
287#######################################
288# compound type declarations
289
# Matches one item inside a struct/union body.  The alternatives are:
#   * the opening of an inline compound type ("struct [name] {"),
#   * a member: optional qualifier + type spec, optional declarator,
#     optional ": size" part, then "," or ";",
#   * the closing brace of the body.
# The "# <NAME>" comments inside the pattern are the capture markers
# mentioned in the module header.  "{{" and "}}" are f-string escapes
# for literal braces.
STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        # inline compound type decl
        (?:
            (?:  # <COMPOUND_TYPE_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <COMPOUND_TYPE_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        (?:
            # typed member
            (?:
                # Technically it doesn't have to have a type...
                (?:  # <SPECIFIER_QUALIFIER>
                    (?: {TYPE_QUALIFIER} \s* )?
                    {_ind(TYPE_SPEC, 5)}
                 )
                (?:
                    # If it doesn't have a declarator then it will have
                    # a size and vice versa.
                    \s*
                    (?:  # <DECLARATOR>
                        {_ind(DECLARATOR, 6)}
                     )
                 )?
            )

            # sized member
            (?:
                \s* [:] \s*
                (?:  # <SIZE>
                    # This is actually a "constant expression".
                    \d+
                    |
                    [^'",}}]+
                 )
             )?
            \s*
            (?:  # <ENDING>
                [,;]
             )
         )
        |
        (?:
            \s*
            (?:  # <CLOSE>
                }}
             )
         )
     )
    ''')
348
# Matches one item inside an enum body: either the closing brace, or a
# member name with an optional "= value" part followed by "," or "}".
# The name uses IDENTIFIER, so keywords are not filtered out here.
# The "# <NAME>" comments inside the pattern are the capture markers
# mentioned in the module header.  "}}" is an f-string escape for a
# literal closing brace.
ENUM_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        (?:
            \s*
            (?:  # <CLOSE>
                }}
             )
         )
        |
        (?:
            \s*
            (?:  # <NAME>
                {IDENTIFIER}
             )
            (?:
                \s* = \s*
                (?:  # <INIT>
                    # This is actually a "constant expression".
                    {_ind(STRING_LITERAL, 4)}
                    |
                    [^'",}}]+
                 )
             )?
            \s*
            (?:  # <ENDING>
                , | }}
             )
         )
     )
    ''')
379
380
381#######################################
382# statements
383
# Matches the text of a statement up to (not including) the next brace or
# semicolon, skipping over string/char literals so that braces and
# semicolons inside them are not treated as delimiters.  The pattern can
# match the empty string.  The lookahead line inside it is commented out
# (it starts with "#" in verbose mode) and therefore has no effect.
SIMPLE_STMT_BODY = textwrap.dedent(rf'''
    # simple statement body
    (?:
        (?:
            [^'"{{}};]*
            {_ind(STRING_LITERAL, 3)}
         )*
        [^'"{{}};]*
        #(?= [;{{] )  # Note this lookahead.
     )
    # end simple statement body
    ''')
# Matches one semicolon-terminated statement.  The alternatives are tried
# in order:
#   * "return" followed by an optional INITIALIZER,
#   * an assignment to a (possibly dereferenced, possibly member-accessed,
#     possibly indexed) variable with an INITIALIZER on the right,
#   * a catch-all "return ..." up to the semicolon,
#   * a catch-all simple statement body.
# In the assignment branch the member-access operator is written "[.]":
# a bare "." would match any character in a regex, not just a dot.
# The "# <NAME>" comments inside the pattern are the capture markers
# mentioned in the module header.
SIMPLE_STMT = textwrap.dedent(rf'''
    # simple statement
    (?:
        (?:  # <SIMPLE_STMT>
            # stmt-inline "initializer"
            (?:
                return \b
                (?:
                    \s*
                    {_ind(INITIALIZER, 5)}
                )?
             )
            |
            # variable assignment
            (?:
                (?: [*] \s* )?
                (?:
                    {STRICT_IDENTIFIER} \s*
                    (?: [.] | -> ) \s*
                 )*
                {STRICT_IDENTIFIER}
                (?: \s* \[ \s* \d+ \s* \] )?
                \s* = \s*
                {_ind(INITIALIZER, 4)}
             )
            |
            # catchall return statement
            (?:
                return \b
                (?:
                    (?:
                        [^'";]*
                        {_ind(STRING_LITERAL, 6)}
                     )*
                    \s* [^'";]*
                 )?
             )
            |
            # simple statement
            (?:
                {_ind(SIMPLE_STMT_BODY, 4)}
             )
         )
        \s*
        (?:  # <SIMPLE_ENDING>
            ;
         )
     )
    # end simple statement
    ''')
# Matches the head of a compound statement, plus trailing whitespace.
# The alternatives are:
#   * a bare "else" or "do",
#   * a label: "case <expr>", "default" or an identifier, followed by ":",
#   * "for", "while", "if" or "switch", accepted only when "(" follows
#     (the paren itself is not consumed).
# The "# <NAME>" comments inside the pattern are the capture markers
# mentioned in the module header.
COMPOUND_STMT = textwrap.dedent(rf'''
    # compound statement
    (?:
        \b
        (?:
            (?:
                (?:  # <COMPOUND_BARE>
                    else | do
                 )
                \b
             )
            |
            (?:
                (?:  # <COMPOUND_LABELED>
                    (?:
                        case \b
                        (?:
                            [^'":]*
                            {_ind(STRING_LITERAL, 7)}
                         )*
                        \s* [^'":]*
                     )
                    |
                    default
                    |
                    {STRICT_IDENTIFIER}
                 )
                \s* [:]
             )
            |
            (?:
                (?:  # <COMPOUND_PAREN>
                    for | while | if | switch
                 )
                \s* (?= [(] )  # Note this lookahead.
             )
         )
        \s*
     )
    # end compound statement
    ''')
487
488
489#######################################
490# function bodies
491
# Matches one item of a function body.  The alternatives are tried in
# this order, so earlier ones take precedence:
#   * an empty statement (";"),
#   * the opening of an inline struct/union/enum definition,
#   * a variable declaration with optional initializer, ending in "," or
#     ";",
#   * the head of a compound statement (COMPOUND_STMT),
#   * the start of a block ("{"), with optional leading text,
#   * a simple statement (SIMPLE_STMT),
#   * the end of a block ("}").
# The "# <NAME>" / "# </NAME>" comments inside the pattern are the
# capture markers mentioned in the module header.  "{{" and "}}" are
# f-string escapes for literal braces.
LOCAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?:  # <EMPTY>
            ;
         )
        |
        # inline type decl
        (?:
            (?:
                (?:  # <INLINE_LEADING>
                    [^;{{}}]+?
                 )
                \s*
             )?
            (?:  # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
             )?  # </INLINE_PRE>
            (?:  # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # var decl
        (?:
            (?:  # <STORAGE>
                {STORAGE_CLASS}
             )?  # </STORAGE>
            (?:
                \s*
                (?:  # <VAR_DECL>
                    {_ind(VAR_DECL, 5)}
                 )
             )
            (?:
                (?:
                    # initializer
                    # We expect only basic initializers.
                    \s* = \s*
                    (?:  # <VAR_INIT>
                        {_ind(INITIALIZER, 6)}
                     )
                 )?
                (?:
                    \s*
                    (?:  # <VAR_ENDING>
                        [,;]
                     )
                 )
             )
         )
        |
        {_ind(COMPOUND_STMT, 2)}
        |
        # start-of-block
        (?:
            (?:  # <BLOCK_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 5)}
                 )*
                [^'"{{}};]*
                # Presumably we will not see "== {{".
                [^\s='"{{}});]
                \s*
             )?  # </BLOCK_LEADING>
            (?:  # <BLOCK_OPEN>
                {{
             )
         )
        |
        {_ind(SIMPLE_STMT, 2)}
        |
        # end-of-block
        (?:  # <BLOCK_CLOSE>
            }}
         )
     )
    ''')
579
# A reduced variant of the function-body pattern that only singles out
# static variable declarations.  The alternatives are:
#   * the opening of an inline struct/union/enum definition,
#   * a declaration starting with "static", followed either by
#     "= <initializer>" and one of "," ";" "{", or directly by "," / ";",
#   * anything else: text (with literals skipped) up to and including the
#     next "{", "}" or ";".
# The "# <NAME>" comments inside the pattern are the capture markers
# mentioned in the module header.  "{{" and "}}" are f-string escapes
# for literal braces.
LOCAL_STATICS = textwrap.dedent(rf'''
    (?:
        # inline type decl
        (?:
            (?:
                (?:  # <INLINE_LEADING>
                    [^;{{}}]+?
                 )
                \s*
             )?
            (?:  # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
             )?
            (?:  # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # var decl
        (?:
            # We only look for static variables.
            (?:  # <STATIC_DECL>
                static \b
                (?: \s* {TYPE_QUALIFIER} )?
                \s* {_ind(TYPE_SPEC, 4)}
                \s* {_ind(DECLARATOR, 4)}
             )
            \s*
            (?:
                (?:  # <STATIC_INIT>
                    = \s*
                    {_ind(INITIALIZER, 4)}
                    \s*
                    [,;{{]
                 )
                |
                (?:  # <STATIC_ENDING>
                    [,;]
                 )
             )
         )
        |
        # everything else
        (?:
            (?:  # <DELIM_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 4)}
                 )*
                \s* [^'"{{}};]*
             )
            (?:
                (?:  # <BLOCK_OPEN>
                    {{
                 )
                |
                (?:  # <BLOCK_CLOSE>
                    }}
                 )
                |
                (?:  # <STMT_END>
                    ;
                 )
             )
         )
     )
    ''')
655
656
657#######################################
658# global declarations
659
# Matches one top-level item.  The alternatives are tried in this order,
# so earlier ones take precedence:
#   * an empty statement (";"),
#   * the opening of a struct/union/enum definition, with optional
#     leading text,
#   * a struct/union/enum keyword plus name followed by text up to one of
#     "=" "," ";" "(" "{" (see the "bogus inline decl artifact" note),
#   * a typedef, with an optional parenthesized parameter list,
#   * a function declaration/definition (modern or old-style parameter
#     syntax) or a variable declaration with optional initializer.
# The "# <NAME>" comments inside the pattern are the capture markers
# mentioned in the module header.  "{{" and "}}" are f-string escapes
# for literal braces.
GLOBAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?:  # <EMPTY>
            ;
         )
        |

        # compound type decl (maybe inline)
        (?:
            (?:
                (?:  # <COMPOUND_LEADING>
                    [^;{{}}]+?
                 )
                 \s*
             )?
            (?:  # <COMPOUND_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <COMPOUND_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # bogus inline decl artifact
        # This simplifies resolving the relative syntactic ambiguity of
        # inline structs.
        (?:
            (?:  # <FORWARD_KIND>
                {COMPOUND_TYPE_KIND}
             )
            \s*
            (?:  # <FORWARD_NAME>
                {ANON_IDENTIFIER}
             )
            (?:  # <MAYBE_INLINE_ACTUAL>
                [^=,;({{[*\]]*
                [=,;({{]
             )
         )
        |

        # typedef
        (?:
            \b typedef \b \s*
            (?:  # <TYPEDEF_DECL>
                {_ind(VAR_DECL, 4)}
             )
            (?:
                # We expect no inline type definitions in the parameters.
                \s* [(] \s*
                (?:  # <TYPEDEF_FUNC_PARAMS>
                    [^{{;]*
                 )
                \s* [)]
             )?
            \s* ;
         )
        |

        # func decl/definition & var decls
        # XXX dedicated pattern for funcs (more restricted)?
        (?:
            (?:
                (?:  # <VAR_STORAGE>
                    {STORAGE_CLASS}
                 )
                \s*
             )?
            (?:
                (?:  # <FUNC_INLINE>
                    \b inline \b
                 )
                \s*
             )?
            (?:  # <VAR_DECL>
                {_ind(VAR_DECL, 4)}
             )
            (?:
                # func decl / definition
                (?:
                    (?:
                        # We expect no inline type definitions in the parameters.
                        \s* [(] \s*
                        (?:  # <FUNC_PARAMS>
                            [^{{;]*
                         )
                        \s* [)] \s*
                        (?:  # <FUNC_DELIM>
                            [{{;]
                         )
                     )
                    |
                    (?:
                        # This is some old-school syntax!
                        \s* [(] \s*
                        # We throw away the bare names:
                        {STRICT_IDENTIFIER}
                        (?: \s* , \s* {STRICT_IDENTIFIER} )*
                        \s* [)] \s*

                        # We keep the trailing param declarations:
                        (?:  # <FUNC_LEGACY_PARAMS>
                            # There's at least one!
                            (?: {TYPE_QUALIFIER} \s* )?
                            {_ind(TYPE_SPEC, 7)}
                            \s*
                            {_ind(DECLARATOR, 7)}
                            \s* ;
                            (?:
                                \s*
                                (?: {TYPE_QUALIFIER} \s* )?
                                {_ind(TYPE_SPEC, 8)}
                                \s*
                                {_ind(DECLARATOR, 8)}
                                \s* ;
                             )*
                         )
                        \s* {{
                     )
                 )
                |
                # var / typedef
                (?:
                    (?:
                        # initializer
                        # We expect only basic initializers.
                        \s* = \s*
                        (?:  # <VAR_INIT>
                            {_ind(INITIALIZER, 6)}
                         )
                     )?
                    \s*
                    (?:  # <VAR_ENDING>
                        [,;]
                     )
                 )
             )
         )
     )
    ''')
805