• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Regular expression patterns for C syntax.
2#
3# None of these patterns has any capturing.  However, a number of them
4# have capturing markers compatible with utils.set_capture_groups().
5
6import textwrap
7
8
def _ind(text, level=1, edges='both'):
    """Return *text* re-indented for embedding inside a larger pattern.

    Every line of *text* that is not blank gets ``level`` units of four
    spaces prepended (via ``textwrap.indent``).  *edges* then selects
    how the two ends of the result are normalized:

    * ``'pre'``: leading whitespace is replaced by a newline followed
      by the indent.
    * ``'post'``: trailing whitespace is replaced by a newline followed
      by one level less of indent.
    * ``'both'`` (the default): both of the above.
    * any other value: neither end is modified.
    """
    step = '    '
    prefix = step * level
    result = textwrap.indent(text, prefix)
    if edges in ('pre', 'both'):
        result = f'\n{prefix}{result.lstrip()}'
    if edges in ('post', 'both'):
        result = f'{result.rstrip()}\n{step * (level - 1)}'
    return result
17
18
19#######################################
20# general
21
# A single hexadecimal digit, as used by the "\xHH" escape in character
# literals.  Only 0-9, a-f and A-F are hexadecimal digits in C, so the
# character class is limited to those (it previously accepted every letter).
HEX = r'(?: [0-9a-fA-F] )'
23
# Matches a single C character literal or a single double-quoted string
# literal.  The pattern is written in verbose style (whitespace and "#"
# comments inside it are insignificant), so it is meant for re.VERBOSE.
#
# Character literal forms accepted: one non-quote character, a backslash
# followed by one character (as matched by "."), "\x" plus two HEX digits,
# "\0" plus two decimal digits, and "\o" plus a three-digit value in the
# range 000-255.
# NOTE(review): the "\o..." forms require a literal "o" after the backslash,
# which is not a standard C escape -- confirm that this is intended.
#
# String literal form: a double quote, runs of characters that are neither a
# double quote nor a backslash interleaved with backslash escapes, then the
# closing double quote.
STRING_LITERAL = textwrap.dedent(rf'''
    (?:
        # character literal
        (?:
            ['] [^'] [']
            |
            ['] \\ . [']
            |
            ['] \\x{HEX}{HEX} [']
            |
            ['] \\0\d\d [']
            |
            (?:
                ['] \\o[01]\d\d [']
                |
                ['] \\o2[0-4]\d [']
                |
                ['] \\o25[0-5] [']
             )
         )
        |
        # string literal
        (?:
            ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
         )
        # end string literal
     )
    ''')
52
# The C keywords recognized by this module, as one alternation anchored on
# word boundaries.  This first, multi-line (verbose-style) form is the one
# interpolated into KEYWORD below; the name is rebound to a compact form
# right after that.
_KEYWORD = textwrap.dedent(r'''
    (?:
        \b
        (?:
            auto |
            extern |
            register |
            static |
            typedef |

            const |
            volatile |

            signed |
            unsigned |
            char |
            short |
            int |
            long |
            float |
            double |
            void |

            struct |
            union |
            enum |

            goto |
            return |
            sizeof |
            break |
            continue |
            if |
            else |
            for |
            do |
            while |
            switch |
            case |
            default |
            entry
         )
        \b
     )
    ''')
# Public keyword pattern: the multi-line _KEYWORD wrapped in start/end
# marker comments.
KEYWORD = rf'''
    # keyword
    {_KEYWORD}
    # end keyword
    '''
# Rebind _KEYWORD to the same pattern with all whitespace removed.  This
# must happen after KEYWORD is built (which uses the multi-line form); the
# compact form is what STRICT_IDENTIFIER and ANON_IDENTIFIER embed.
_KEYWORD = ''.join(_KEYWORD.split())
104
# A bare C identifier: a letter or underscore followed by any number of
# letters, digits or underscores.  Keywords are not excluded here.
IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
# We use a negative lookahead to filter out keywords.
STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
# Like STRICT_IDENTIFIER, but additionally allows an optional "-<digits>"
# suffix after the identifier.
# NOTE(review): presumably the suffix is for generated names of anonymous
# types -- confirm against the code that produces such names.
ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'
109
110
111#######################################
112# types
113
# Matches a builtin type spelling as a whole word (verbose-style pattern):
#   - "void",
#   - a bare "signed" or "unsigned",
#   - one of char/short/int/long/float/double, optionally preceded by
#     signed/unsigned and then by a single "long" or "short" modifier.
# Only one "long"/"short" modifier is accepted before the base type.
SIMPLE_TYPE = textwrap.dedent(rf'''
    # simple type
    (?:
        \b
        (?:
            void
            |
            (?: signed | unsigned )  # implies int
            |
            (?:
                (?: (?: signed | unsigned ) \s+ )?
                (?: (?: long | short ) \s+ )?
                (?: char | short | int | long | float | double )
             )
         )
        \b
     )
    # end simple type
    ''')
133
# One of the keywords that introduce a compound type (struct, union or
# enum), matched as a whole word.
COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'
135
136
137#######################################
138# variable declarations
139
# Storage-class keywords, in the order they appear in STORAGE_CLASS.
_STORAGE = ['auto', 'register', 'static', 'extern']
# Any one storage-class keyword, matched as a whole word.
STORAGE_CLASS = r'(?: \b (?: {} ) \b )'.format(' | '.join(_STORAGE))
# Any one type qualifier, matched as a whole word.
TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
# A pointer star, optionally followed by a type qualifier.
PTR_QUALIFIER = r'(?: [*] (?: \s* {} )? )'.format(TYPE_QUALIFIER)
144
# Matches a type specifier (verbose-style pattern).  Alternatives, in the
# order they are tried:
#   1. a builtin type (SIMPLE_TYPE),
#   2. a typeof(...) form wrapping a single non-keyword identifier that may
#      be prefixed by "*"/"&"; any number of underscores may surround the
#      word "typeof",
#   3. struct/union/enum, optionally followed by an ANON_IDENTIFIER name,
#   4. any other non-keyword identifier (a reference to a typedef).
TYPE_SPEC = textwrap.dedent(rf'''
    # type spec
    (?:
        {_ind(SIMPLE_TYPE, 2)}
        |
        (?:
            [_]*typeof[_]*
            \s* [(]
            (?: \s* [*&] )*
            \s* {STRICT_IDENTIFIER}
            \s* [)]
         )
        |
        # reference to a compound type
        (?:
            {COMPOUND_TYPE_KIND}
            (?: \s* {ANON_IDENTIFIER} )?
         )
        |
        # reference to a typedef
        {STRICT_IDENTIFIER}
     )
    # end type spec
    ''')
169
# Matches a declarator (verbose-style pattern): zero or more pointer stars,
# each optionally qualified, followed by one of
#   - a non-keyword identifier with optional "[...]" array suffixes,
#   - the same, wrapped in a single pair of parentheses,
#   - a function pointer: "(" optional pointer star, identifier, optional
#     array suffixes, ")" and then a parenthesized parameter list that may
#     contain one level of nested parentheses.
# The "# <NAME>" comments inside the pattern look like the capture markers
# mentioned in the module header; they are part of the pattern text and must
# not be altered.
DECLARATOR = textwrap.dedent(rf'''
    # declarator  (possibly abstract)
    (?:
        (?: {PTR_QUALIFIER} \s* )*
        (?:
            (?:
                (?:  # <IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
             )
            |
            (?:
                [(] \s*
                (?:  # <WRAPPED_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
                \s* [)]
             )
            |
            # func ptr
            (?:
                [(] (?: \s* {PTR_QUALIFIER} )? \s*
                (?:  # <FUNC_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
                \s* [)]
                # We allow for a single level of paren nesting in parameters.
                \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
             )
         )
     )
    # end declarator
    ''')
206
# Matches the leading part of a declaration (verbose-style pattern): an
# optional storage class, an optional type qualifier, a TYPE_SPEC and a
# DECLARATOR, in that order.  Per the comment inside the pattern it is also
# used for typedefs and function return types.  It does not match an
# initializer or the terminating "," / ";".
VAR_DECL = textwrap.dedent(rf'''
    # var decl (and typedef and func return type)
    (?:
        (?:
            (?:  # <STORAGE>
                {STORAGE_CLASS}
            )
            \s*
        )?
        (?:
            (?:  # <TYPE_QUAL>
                {TYPE_QUALIFIER}
            )
            \s*
         )?
        (?:
            (?:  # <TYPE_SPEC>
                {_ind(TYPE_SPEC, 4)}
            )
         )
        \s*
        (?:
            (?:  # <DECLARATOR>
                {_ind(DECLARATOR, 4)}
            )
         )
     )
    # end var decl
    ''')
236
# Matches an initializer expression (verbose-style pattern).  An optional
# leading parenthesized group without nested parentheses may be followed by
# one of:
#   - one or more adjacent STRING_LITERAL matches,
#   - a "simple" initializer: text free of ",", ";" and "{", with string and
#     character literals skipped over as units (this alternative can match
#     the empty string),
#   - a brace-enclosed struct/array literal that must be followed by ";"
#     (checked with a lookahead, so the ";" itself is not consumed).
# "{{" and "}}" are f-string escapes that produce literal braces in the
# resulting pattern.
INITIALIZER = textwrap.dedent(rf'''
    # initializer
    (?:
        (?:
            [(]
            # no nested parens (e.g. func ptr)
            [^)]*
            [)]
            \s*
         )?
        (?:
            # a string literal
            (?:
                (?: {_ind(STRING_LITERAL, 4)} \s* )*
                {_ind(STRING_LITERAL, 4)}
             )
            |

            # a simple initializer
            (?:
                (?:
                    [^'",;{{]*
                    {_ind(STRING_LITERAL, 4)}
                 )*
                [^'",;{{]*
             )
            |

            # a struct/array literal
            (?:
                # We only expect compound initializers with
                # single-variable declarations.
                {{
                (?:
                    [^'";]*?
                    {_ind(STRING_LITERAL, 5)}
                 )*
                [^'";]*?
                }}
                (?= \s* ; )  # Note this lookahead.
             )
         )
     )
    # end initializer
    ''')
282
283
284#######################################
285# compound type declarations
286
# Matches one item of a struct/union body (verbose-style pattern).
# Alternatives, in order:
#   - the opening of an inline compound type: struct/union/enum, an optional
#     name, then "{",
#   - a member: optional type qualifier and TYPE_SPEC, an optional
#     DECLARATOR, an optional ":<digits>" size, then "," or ";",
#   - the closing "}".
# "{{" and "}}" are f-string escapes that produce literal braces.
STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        # inline compound type decl
        (?:
            (?:  # <COMPOUND_TYPE_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <COMPOUND_TYPE_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        (?:
            # typed member
            (?:
                # Technically it doesn't have to have a type...
                (?:  # <SPECIFIER_QUALIFIER>
                    (?: {TYPE_QUALIFIER} \s* )?
                    {_ind(TYPE_SPEC, 5)}
                 )
                (?:
                    # If it doesn't have a declarator then it will have
                    # a size and vice versa.
                    \s*
                    (?:  # <DECLARATOR>
                        {_ind(DECLARATOR, 6)}
                     )
                 )?
            )

            # sized member
            (?:
                \s* [:] \s*
                (?:  # <SIZE>
                    \d+
                 )
             )?
            \s*
            (?:  # <ENDING>
                [,;]
             )
         )
        |
        (?:
            \s*
            (?:  # <CLOSE>
                }}
             )
         )
     )
    ''')
342
# Matches one item of an enum body (verbose-style pattern): either the
# closing "}", or a member consisting of an IDENTIFIER name, an optional
# "= <value>" (a STRING_LITERAL match, or text free of quotes, "," and "}"),
# and a terminating "," or "}".
# "}}" is the f-string escape that produces a literal "}".
ENUM_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        (?:
            \s*
            (?:  # <CLOSE>
                }}
             )
         )
        |
        (?:
            \s*
            (?:  # <NAME>
                {IDENTIFIER}
             )
            (?:
                \s* = \s*
                (?:  # <INIT>
                    {_ind(STRING_LITERAL, 4)}
                    |
                    [^'",}}]+
                 )
             )?
            \s*
            (?:  # <ENDING>
                , | }}
             )
         )
     )
    ''')
372
373
374#######################################
375# statements
376
# Matches the text of a simple statement up to (not including) its
# terminator (verbose-style pattern): any run of characters other than
# quotes, braces and ";", with string and character literals skipped over as
# units.  It can match the empty string.  The lookahead line inside the
# pattern starts with "#", so in verbose mode it is a comment and inactive.
SIMPLE_STMT_BODY = textwrap.dedent(rf'''
    # simple statement body
    (?:
        (?:
            [^'"{{}};]*
            {_ind(STRING_LITERAL, 3)}
         )*
        [^'"{{}};]*
        #(?= [;{{] )  # Note this lookahead.
     )
    # end simple statement body
    ''')
# Matches one complete simple (non-compound) statement, including its
# terminating ";" (verbose-style pattern).  Alternatives inside the
# <SIMPLE_STMT> group, in the order they are tried:
#   - "return" optionally followed by an INITIALIZER-style expression,
#   - an assignment to a variable that may be dereferenced with "*", reached
#     through a chain of member accesses, and indexed by a decimal constant,
#   - a catch-all "return ..." form,
#   - any other SIMPLE_STMT_BODY text.
# Member access is matched with a literal "." or "->".  The dot is written
# as "[.]" because a bare "." in a regex matches any character, which would
# let arbitrary single characters act as the member-access operator.
SIMPLE_STMT = textwrap.dedent(rf'''
    # simple statement
    (?:
        (?:  # <SIMPLE_STMT>
            # stmt-inline "initializer"
            (?:
                return \b
                (?:
                    \s*
                    {_ind(INITIALIZER, 5)}
                )?
             )
            |
            # variable assignment
            (?:
                (?: [*] \s* )?
                (?:
                    {STRICT_IDENTIFIER} \s*
                    (?: [.] | -> ) \s*
                 )*
                {STRICT_IDENTIFIER}
                (?: \s* \[ \s* \d+ \s* \] )?
                \s* = \s*
                {_ind(INITIALIZER, 4)}
             )
            |
            # catchall return statement
            (?:
                return \b
                (?:
                    (?:
                        [^'";]*
                        {_ind(STRING_LITERAL, 6)}
                     )*
                    \s* [^'";]*
                 )?
             )
            |
            # simple statement
            (?:
                {_ind(SIMPLE_STMT_BODY, 4)}
             )
         )
        \s*
        (?:  # <SIMPLE_ENDING>
            ;
         )
     )
    # end simple statement
    ''')
# Matches the head of a compound statement (verbose-style pattern), starting
# at a word boundary.  Alternatives, in order:
#   - a bare "else" or "do",
#   - a label followed by ":" -- "case <expr>" (with string/character
#     literals skipped over as units), "default", or a non-keyword
#     identifier,
#   - "for", "while", "if" or "switch" when followed by "(" (checked with a
#     lookahead, so the "(" itself is not consumed).
# Trailing whitespace after the head is consumed.
COMPOUND_STMT = textwrap.dedent(rf'''
    # compound statement
    (?:
        \b
        (?:
            (?:
                (?:  # <COMPOUND_BARE>
                    else | do
                 )
                \b
             )
            |
            (?:
                (?:  # <COMPOUND_LABELED>
                    (?:
                        case \b
                        (?:
                            [^'":]*
                            {_ind(STRING_LITERAL, 7)}
                         )*
                        \s* [^'":]*
                     )
                    |
                    default
                    |
                    {STRICT_IDENTIFIER}
                 )
                \s* [:]
             )
            |
            (?:
                (?:  # <COMPOUND_PAREN>
                    for | while | if | switch
                 )
                \s* (?= [(] )  # Note this lookahead.
             )
         )
        \s*
     )
    # end compound statement
    ''')
480
481
482#######################################
483# function bodies
484
# Matches one item inside a function body (verbose-style pattern).
# Alternatives, in the order they are tried:
#   1. an empty statement (";"),
#   2. the opening of an inline struct/union/enum definition, optionally
#      preceded by other text and by storage class / type qualifier,
#   3. a variable declaration (VAR_DECL) with an optional "= INITIALIZER",
#      ending in "," or ";",
#   4. the head of a compound statement (COMPOUND_STMT),
#   5. the start of a block: optional leading text, then "{",
#   6. a simple statement (SIMPLE_STMT),
#   7. the end of a block ("}").
# Because regex alternation is ordered, the sequence above matters.
# "{{" and "}}" are f-string escapes that produce literal braces.
LOCAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?:  # <EMPTY>
            ;
         )
        |
        # inline type decl
        (?:
            (?:
                (?:  # <INLINE_LEADING>
                    [^;{{}}]+?
                 )
                \s*
             )?
            (?:  # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
             )?  # </INLINE_PRE>
            (?:  # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # var decl
        (?:
            (?:  # <STORAGE>
                {STORAGE_CLASS}
             )?  # </STORAGE>
            (?:
                \s*
                (?:  # <VAR_DECL>
                    {_ind(VAR_DECL, 5)}
                 )
             )
            (?:
                (?:
                    # initializer
                    # We expect only basic initializers.
                    \s* = \s*
                    (?:  # <VAR_INIT>
                        {_ind(INITIALIZER, 6)}
                     )
                 )?
                (?:
                    \s*
                    (?:  # <VAR_ENDING>
                        [,;]
                     )
                 )
             )
         )
        |
        {_ind(COMPOUND_STMT, 2)}
        |
        # start-of-block
        (?:
            (?:  # <BLOCK_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 5)}
                 )*
                [^'"{{}};]*
                # Presumably we will not see "== {{".
                [^\s='"{{}});]
                \s*
             )?  # </BLOCK_LEADING>
            (?:  # <BLOCK_OPEN>
                {{
             )
         )
        |
        {_ind(SIMPLE_STMT, 2)}
        |
        # end-of-block
        (?:  # <BLOCK_CLOSE>
            }}
         )
     )
    ''')
572
# Matches one item inside a function body, singling out only static
# variable declarations (verbose-style pattern).  Alternatives, in order:
#   1. the opening of an inline struct/union/enum definition, optionally
#      preceded by other text and by storage class / type qualifier,
#   2. a "static" variable declaration (optional qualifier, TYPE_SPEC,
#      DECLARATOR) followed either by "= INITIALIZER" and one of ",", ";",
#      "{", or directly by "," or ";",
#   3. everything else: any text (string/character literals skipped over as
#      units) up to and including the next "{", "}" or ";".
# "{{" and "}}" are f-string escapes that produce literal braces.
LOCAL_STATICS = textwrap.dedent(rf'''
    (?:
        # inline type decl
        (?:
            (?:
                (?:  # <INLINE_LEADING>
                    [^;{{}}]+?
                 )
                \s*
             )?
            (?:  # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
             )?
            (?:  # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # var decl
        (?:
            # We only look for static variables.
            (?:  # <STATIC_DECL>
                static \b
                (?: \s* {TYPE_QUALIFIER} )?
                \s* {_ind(TYPE_SPEC, 4)}
                \s* {_ind(DECLARATOR, 4)}
             )
            \s*
            (?:
                (?:  # <STATIC_INIT>
                    = \s*
                    {_ind(INITIALIZER, 4)}
                    \s*
                    [,;{{]
                 )
                |
                (?:  # <STATIC_ENDING>
                    [,;]
                 )
             )
         )
        |
        # everything else
        (?:
            (?:  # <DELIM_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 4)}
                 )*
                \s* [^'"{{}};]*
             )
            (?:
                (?:  # <BLOCK_OPEN>
                    {{
                 )
                |
                (?:  # <BLOCK_CLOSE>
                    }}
                 )
                |
                (?:  # <STMT_END>
                    ;
                 )
             )
         )
     )
    ''')
648
649
650#######################################
651# global declarations
652
# Matches one top-level (file-scope) item (verbose-style pattern).
# Alternatives, in the order they are tried:
#   1. an empty statement (";"),
#   2. the opening of a struct/union/enum definition, optionally preceded by
#      other text and optionally named, ending at "{",
#   3. struct/union/enum followed by a (possibly anonymous) name and then
#      text up to one of "=", ",", ";", "(", "{" -- described in the pattern
#      as a "bogus inline decl artifact",
#   4. a typedef: "typedef", a VAR_DECL, an optional parenthesized parameter
#      list, then ";",
#   5. a function declaration/definition or variable declaration: optional
#      storage class, optional "inline", a VAR_DECL, and then either
#        - a parameter list followed by "{" or ";",
#        - an old-style parameter name list followed by one or more
#          parameter declarations and "{", or
#        - an optional "= INITIALIZER" followed by "," or ";".
# Because regex alternation is ordered, the sequence above matters.
# "{{" and "}}" are f-string escapes that produce literal braces.
GLOBAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?:  # <EMPTY>
            ;
         )
        |

        # compound type decl (maybe inline)
        (?:
            (?:
                (?:  # <COMPOUND_LEADING>
                    [^;{{}}]+?
                 )
                 \s*
             )?
            (?:  # <COMPOUND_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <COMPOUND_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # bogus inline decl artifact
        # This simplifies resolving the relative syntactic ambiguity of
        # inline structs.
        (?:
            (?:  # <FORWARD_KIND>
                {COMPOUND_TYPE_KIND}
             )
            \s*
            (?:  # <FORWARD_NAME>
                {ANON_IDENTIFIER}
             )
            (?:  # <MAYBE_INLINE_ACTUAL>
                [^=,;({{[*\]]*
                [=,;({{]
             )
         )
        |

        # typedef
        (?:
            \b typedef \b \s*
            (?:  # <TYPEDEF_DECL>
                {_ind(VAR_DECL, 4)}
             )
            (?:
                # We expect no inline type definitions in the parameters.
                \s* [(] \s*
                (?:  # <TYPEDEF_FUNC_PARAMS>
                    [^{{;]*
                 )
                \s* [)]
             )?
            \s* ;
         )
        |

        # func decl/definition & var decls
        # XXX dedicated pattern for funcs (more restricted)?
        (?:
            (?:
                (?:  # <VAR_STORAGE>
                    {STORAGE_CLASS}
                 )
                \s*
             )?
            (?:
                (?:  # <FUNC_INLINE>
                    \b inline \b
                 )
                \s*
             )?
            (?:  # <VAR_DECL>
                {_ind(VAR_DECL, 4)}
             )
            (?:
                # func decl / definition
                (?:
                    (?:
                        # We expect no inline type definitions in the parameters.
                        \s* [(] \s*
                        (?:  # <FUNC_PARAMS>
                            [^{{;]*
                         )
                        \s* [)] \s*
                        (?:  # <FUNC_DELIM>
                            [{{;]
                         )
                     )
                    |
                    (?:
                        # This is some old-school syntax!
                        \s* [(] \s*
                        # We throw away the bare names:
                        {STRICT_IDENTIFIER}
                        (?: \s* , \s* {STRICT_IDENTIFIER} )*
                        \s* [)] \s*

                        # We keep the trailing param declarations:
                        (?:  # <FUNC_LEGACY_PARAMS>
                            # There's at least one!
                            (?: {TYPE_QUALIFIER} \s* )?
                            {_ind(TYPE_SPEC, 7)}
                            \s*
                            {_ind(DECLARATOR, 7)}
                            \s* ;
                            (?:
                                \s*
                                (?: {TYPE_QUALIFIER} \s* )?
                                {_ind(TYPE_SPEC, 8)}
                                \s*
                                {_ind(DECLARATOR, 8)}
                                \s* ;
                             )*
                         )
                        \s* {{
                     )
                 )
                |
                # var / typedef
                (?:
                    (?:
                        # initializer
                        # We expect only basic initializers.
                        \s* = \s*
                        (?:  # <VAR_INIT>
                            {_ind(INITIALIZER, 6)}
                         )
                     )?
                    \s*
                    (?:  # <VAR_ENDING>
                        [,;]
                     )
                 )
             )
         )
     )
    ''')
798