• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
# Regular expression patterns for C syntax.
#
# None of these patterns has any capturing groups.  However, a number of
# them have capturing markers compatible with utils.set_capture_groups().
5
6import textwrap
7
8
def _ind(text, level=1, edges='both'):
    """Indent a pattern fragment so it can be embedded in a larger one.

    Every non-blank line of *text* is prefixed with four spaces per
    *level*.  *edges* then controls how the two ends are normalized:

    * 'pre'  - strip leading whitespace and start with a newline
               followed by the full indent;
    * 'post' - strip trailing whitespace and end with a newline
               followed by one level less of indent;
    * 'both' - apply both of the above (the default).

    Any other value leaves the ends of the indented text untouched.
    """
    prefix = '    ' * level
    indented = textwrap.indent(text, prefix)
    if edges in ('pre', 'both'):
        indented = f'\n{prefix}{indented.lstrip()}'
    if edges in ('post', 'both'):
        # The trailing line is indented one level less than the body.
        closing = '    ' * (level - 1)
        indented = f'{indented.rstrip()}\n{closing}'
    return indented
17
18
19#######################################
20# general
21
# A single hexadecimal digit (verbose-mode pattern; the inner spaces are
# not significant under re.VERBOSE).  Only 0-9, a-f and A-F are hex
# digits, so the class must not span the whole alphabet.
HEX = r'(?: [0-9a-fA-F] )'
23
# Matches either a C character literal (single-quoted) or a C string
# literal (double-quoted, with backslash escapes allowed inside).
# The pattern is written for verbose mode (it contains whitespace and
# "#" comments) and has no capturing groups.
# NOTE(review): the "\\o..." alternatives require a literal "o" after the
# backslash, which does not look like C's octal escape syntax -- confirm
# the intent before relying on them.
STRING_LITERAL = textwrap.dedent(rf'''
    (?:
        # character literal
        (?:
            ['] [^'] [']
            |
            ['] \\ . [']
            |
            ['] \\x{HEX}{HEX} [']
            |
            ['] \\0\d\d [']
            |
            (?:
                ['] \\o[01]\d\d [']
                |
                ['] \\o2[0-4]\d [']
                |
                ['] \\o25[0-5] [']
             )
         )
        |
        # string literal
        (?:
            ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
         )
        # end string literal
     )
    ''')
52
# Word-bounded alternation of the C keywords recognized by this module.
# It is first built in readable, multi-line (verbose) form...
_KEYWORD = textwrap.dedent(r'''
    (?:
        \b
        (?:
            auto |
            extern |
            register |
            static |
            _Thread_local |
            typedef |

            const |
            volatile |

            signed |
            unsigned |
            char |
            short |
            int |
            long |
            float |
            double |
            void |

            struct |
            union |
            enum |

            goto |
            return |
            sizeof |
            break |
            continue |
            if |
            else |
            for |
            do |
            while |
            switch |
            case |
            default |
            entry
         )
        \b
     )
    ''')
# ...which is what the public KEYWORD pattern embeds (it must be built
# before _KEYWORD is rebound below)...
KEYWORD = rf'''
    # keyword
    {_KEYWORD}
    # end keyword
    '''
# ...and is then collapsed by removing all whitespace, so the compact
# form can be embedded in the one-line identifier patterns.
_KEYWORD = ''.join(_KEYWORD.split())
105
# A C identifier: a letter or underscore followed by letters, digits
# or underscores.
IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
# We use a negative lookahead to filter out keywords.
# STRICT_IDENTIFIER is a whole-word identifier that is not a keyword.
STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
# Like STRICT_IDENTIFIER, but also accepts an optional "-<digits>" suffix.
ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'
110
111
112#######################################
113# types
114
# Matches a builtin C type: "void", a bare "signed"/"unsigned", or a base
# type (char, short, int, long, float, double) optionally preceded by a
# signedness word and/or a single "long"/"short" modifier.
SIMPLE_TYPE = textwrap.dedent(rf'''
    # simple type
    (?:
        \b
        (?:
            void
            |
            (?: signed | unsigned )  # implies int
            |
            (?:
                (?: (?: signed | unsigned ) \s+ )?
                (?: (?: long | short ) \s+ )?
                (?: char | short | int | long | float | double )
             )
         )
        \b
     )
    # end simple type
    ''')
134
# The keyword introducing a compound type: struct, union or enum.
COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'
136
137
138#######################################
139# variable declarations
140
# The storage-class keywords, kept as a list so the alternation below
# can be generated from it.
_STORAGE = 'auto register static extern _Thread_local'.split()
# Any one storage-class keyword, as a whole word.
STORAGE_CLASS = rf'(?: \b (?: {" | ".join(_STORAGE)} ) \b )'
# A type qualifier keyword (const or volatile), as a whole word.
TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
# A single "*" optionally followed by a type qualifier.
PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )'
145
# Matches a type specifier.  The alternatives are tried in this order:
# a builtin type (SIMPLE_TYPE), a typeof(...) expression around a single
# identifier, a reference to a struct/union/enum (name optional), or a
# bare identifier taken to be a typedef name.
TYPE_SPEC = textwrap.dedent(rf'''
    # type spec
    (?:
        {_ind(SIMPLE_TYPE, 2)}
        |
        (?:
            [_]*typeof[_]*
            \s* [(]
            (?: \s* [*&] )*
            \s* {STRICT_IDENTIFIER}
            \s* [)]
         )
        |
        # reference to a compound type
        (?:
            {COMPOUND_TYPE_KIND}
            (?: \s* {ANON_IDENTIFIER} )?
         )
        |
        # reference to a typedef
        {STRICT_IDENTIFIER}
     )
    # end type spec
    ''')
170
# Matches a declarator: any number of pointer qualifiers followed by one
# of three forms -- a plain identifier, a parenthesized identifier, or a
# function pointer "(*name)(params)".  Each form may carry trailing
# array brackets.  The "# <NAME>" comments inside the pattern are
# presumably the capture markers mentioned in the file header.
DECLARATOR = textwrap.dedent(rf'''
    # declarator  (possibly abstract)
    (?:
        (?: {PTR_QUALIFIER} \s* )*
        (?:
            (?:
                (?:  # <IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
             )
            |
            (?:
                [(] \s*
                (?:  # <WRAPPED_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
                \s* [)]
             )
            |
            # func ptr
            (?:
                [(] (?: \s* {PTR_QUALIFIER} )? \s*
                (?:  # <FUNC_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
                \s* [)]
                # We allow for a single level of paren nesting in parameters.
                \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
             )
         )
     )
    # end declarator
    ''')
210
# Matches the head of a declaration: an optional storage class, an
# optional type qualifier, a type specifier and a declarator, in that
# order.  It does not include any initializer or terminating ";".
VAR_DECL = textwrap.dedent(rf'''
    # var decl (and typedef and func return type)
    (?:
        (?:
            (?:  # <STORAGE>
                {STORAGE_CLASS}
            )
            \s*
        )?
        (?:
            (?:  # <TYPE_QUAL>
                {TYPE_QUALIFIER}
            )
            \s*
         )?
        (?:
            (?:  # <TYPE_SPEC>
                {_ind(TYPE_SPEC, 4)}
            )
         )
        \s*
        (?:
            (?:  # <DECLARATOR>
                {_ind(DECLARATOR, 4)}
            )
         )
     )
    # end var decl
    ''')
240
# Matches the right-hand side of an initialization: an optional leading
# parenthesized group (without nested parens), followed by one of
#   * one or more adjacent string/character literals,
#   * a "simple" initializer containing no "," ";" or "{" outside of
#     literals, or
#   * a brace-delimited literal that must be followed by ";".
# Doubled braces are f-string escapes for literal "{" and "}".
INITIALIZER = textwrap.dedent(rf'''
    # initializer
    (?:
        (?:
            [(]
            # no nested parens (e.g. func ptr)
            [^)]*
            [)]
            \s*
         )?
        (?:
            # a string literal
            (?:
                (?: {_ind(STRING_LITERAL, 4)} \s* )*
                {_ind(STRING_LITERAL, 4)}
             )
            |

            # a simple initializer
            (?:
                (?:
                    [^'",;{{]*
                    {_ind(STRING_LITERAL, 4)}
                 )*
                [^'",;{{]*
             )
            |

            # a struct/array literal
            (?:
                # We only expect compound initializers with
                # single-variable declarations.
                {{
                (?:
                    [^'";]*?
                    {_ind(STRING_LITERAL, 5)}
                 )*
                [^'";]*?
                }}
                (?= \s* ; )  # Note this lookahead.
             )
         )
     )
    # end initializer
    ''')
286
287
288#######################################
289# compound type declarations
290
# Matches one item inside a struct/union body.  The alternatives are:
#   * the opening of a nested compound type ("struct [name] {"),
#   * a member: qualifier/type, optional declarator, optional
#     ": <size>" part, terminated by "," or ";",
#   * the closing "}" of the body.
# Doubled braces are f-string escapes for literal "{" and "}".
STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        # inline compound type decl
        (?:
            (?:  # <COMPOUND_TYPE_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <COMPOUND_TYPE_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        (?:
            # typed member
            (?:
                # Technically it doesn't have to have a type...
                (?:  # <SPECIFIER_QUALIFIER>
                    (?: {TYPE_QUALIFIER} \s* )?
                    {_ind(TYPE_SPEC, 5)}
                 )
                (?:
                    # If it doesn't have a declarator then it will have
                    # a size and vice versa.
                    \s*
                    (?:  # <DECLARATOR>
                        {_ind(DECLARATOR, 6)}
                     )
                 )?
            )

            # sized member
            (?:
                \s* [:] \s*
                (?:  # <SIZE>
                    # This is actually a "constant expression".
                    \d+
                    |
                    [^'",}}]+
                 )
             )?
            \s*
            (?:  # <ENDING>
                [,;]
             )
         )
        |
        (?:
            \s*
            (?:  # <CLOSE>
                }}
             )
         )
     )
    ''')
349
# Matches one item inside an enum body: either the closing "}" or a
# member name with an optional "= <value>" part, terminated by "," or
# by the closing "}".
ENUM_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        (?:
            \s*
            (?:  # <CLOSE>
                }}
             )
         )
        |
        (?:
            \s*
            (?:  # <NAME>
                {IDENTIFIER}
             )
            (?:
                \s* = \s*
                (?:  # <INIT>
                    # This is actually a "constant expression".
                    {_ind(STRING_LITERAL, 4)}
                    |
                    [^'",}}]+
                 )
             )?
            \s*
            (?:  # <ENDING>
                , | }}
             )
         )
     )
    ''')
380
381
382#######################################
383# statements
384
# Matches the text of a statement up to (not including) its terminator:
# any run of characters other than quotes, braces and ";", with string
# and character literals consumed whole so that their contents cannot
# end the match early.  The lookahead inside the pattern is disabled
# (it sits behind a verbose-mode "#").
SIMPLE_STMT_BODY = textwrap.dedent(rf'''
    # simple statement body
    (?:
        (?:
            [^'"{{}};]*
            {_ind(STRING_LITERAL, 3)}
         )*
        [^'"{{}};]*
        #(?= [;{{] )  # Note this lookahead.
     )
    # end simple statement body
    ''')
# Matches a complete simple statement, including its terminating ";".
# The alternatives are tried in this order:
#   * "return" followed by an optional INITIALIZER-shaped value,
#   * an assignment to a (possibly dereferenced, possibly member-access
#     chained, possibly indexed) variable with an INITIALIZER value,
#   * a catch-all "return ...",
#   * anything matched by SIMPLE_STMT_BODY.
# The member-access separator is written as "[.]" so that only a literal
# dot (or "->") joins the identifiers; a bare "." would match any
# character at all.
SIMPLE_STMT = textwrap.dedent(rf'''
    # simple statement
    (?:
        (?:  # <SIMPLE_STMT>
            # stmt-inline "initializer"
            (?:
                return \b
                (?:
                    \s*
                    {_ind(INITIALIZER, 5)}
                )?
             )
            |
            # variable assignment
            (?:
                (?: [*] \s* )?
                (?:
                    {STRICT_IDENTIFIER} \s*
                    (?: [.] | -> ) \s*
                 )*
                {STRICT_IDENTIFIER}
                (?: \s* \[ \s* \d+ \s* \] )?
                \s* = \s*
                {_ind(INITIALIZER, 4)}
             )
            |
            # catchall return statement
            (?:
                return \b
                (?:
                    (?:
                        [^'";]*
                        {_ind(STRING_LITERAL, 6)}
                     )*
                    \s* [^'";]*
                 )?
             )
            |
            # simple statement
            (?:
                {_ind(SIMPLE_STMT_BODY, 4)}
             )
         )
        \s*
        (?:  # <SIMPLE_ENDING>
            ;
         )
     )
    # end simple statement
    ''')
# Matches the introducer of a compound statement (not its body):
#   * a bare "else" or "do",
#   * a label -- "case <expr>", "default" or an identifier -- followed
#     by ":",
#   * "for", "while", "if" or "switch", which must be followed by "("
#     (checked with a lookahead, so the paren itself is not consumed).
COMPOUND_STMT = textwrap.dedent(rf'''
    # compound statement
    (?:
        \b
        (?:
            (?:
                (?:  # <COMPOUND_BARE>
                    else | do
                 )
                \b
             )
            |
            (?:
                (?:  # <COMPOUND_LABELED>
                    (?:
                        case \b
                        (?:
                            [^'":]*
                            {_ind(STRING_LITERAL, 7)}
                         )*
                        \s* [^'":]*
                     )
                    |
                    default
                    |
                    {STRICT_IDENTIFIER}
                 )
                \s* [:]
             )
            |
            (?:
                (?:  # <COMPOUND_PAREN>
                    for | while | if | switch
                 )
                \s* (?= [(] )  # Note this lookahead.
             )
         )
        \s*
     )
    # end compound statement
    ''')
488
489
490#######################################
491# function bodies
492
# Matches one item inside a function body.  The alternatives are tried
# in this order: an empty statement, the opening of an inline compound
# type declaration, a variable declaration (with optional initializer),
# a compound-statement introducer, the opening "{" of a block (with
# optional leading text), a simple statement, or a closing "}".
# Doubled braces are f-string escapes for literal "{" and "}".
LOCAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?:  # <EMPTY>
            ;
         )
        |
        # inline type decl
        (?:
            (?:
                (?:  # <INLINE_LEADING>
                    [^;{{}}]+?
                 )
                \s*
             )?
            (?:  # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
             )?  # </INLINE_PRE>
            (?:  # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # var decl
        (?:
            (?:  # <STORAGE>
                {STORAGE_CLASS}
             )?  # </STORAGE>
            (?:
                \s*
                (?:  # <VAR_DECL>
                    {_ind(VAR_DECL, 5)}
                 )
             )
            (?:
                (?:
                    # initializer
                    # We expect only basic initializers.
                    \s* = \s*
                    (?:  # <VAR_INIT>
                        {_ind(INITIALIZER, 6)}
                     )
                 )?
                (?:
                    \s*
                    (?:  # <VAR_ENDING>
                        [,;]
                     )
                 )
             )
         )
        |
        {_ind(COMPOUND_STMT, 2)}
        |
        # start-of-block
        (?:
            (?:  # <BLOCK_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 5)}
                 )*
                [^'"{{}};]*
                # Presumably we will not see "== {{".
                [^\s='"{{}});]
                \s*
             )?  # </BLOCK_LEADING>
            (?:  # <BLOCK_OPEN>
                {{
             )
         )
        |
        {_ind(SIMPLE_STMT, 2)}
        |
        # end-of-block
        (?:  # <BLOCK_CLOSE>
            }}
         )
     )
    ''')
580
# A coarser alternative to LOCAL that only singles out static variable
# declarations.  The alternatives are: the opening of an inline compound
# type declaration, a "static ..." declaration (with optional
# initializer), or any other text up to the next "{", "}" or ";".
# Doubled braces are f-string escapes for literal "{" and "}".
LOCAL_STATICS = textwrap.dedent(rf'''
    (?:
        # inline type decl
        (?:
            (?:
                (?:  # <INLINE_LEADING>
                    [^;{{}}]+?
                 )
                \s*
             )?
            (?:  # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
             )?
            (?:  # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # var decl
        (?:
            # We only look for static variables.
            (?:  # <STATIC_DECL>
                static \b
                (?: \s* {TYPE_QUALIFIER} )?
                \s* {_ind(TYPE_SPEC, 4)}
                \s* {_ind(DECLARATOR, 4)}
             )
            \s*
            (?:
                (?:  # <STATIC_INIT>
                    = \s*
                    {_ind(INITIALIZER, 4)}
                    \s*
                    [,;{{]
                 )
                |
                (?:  # <STATIC_ENDING>
                    [,;]
                 )
             )
         )
        |
        # everything else
        (?:
            (?:  # <DELIM_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 4)}
                 )*
                \s* [^'"{{}};]*
             )
            (?:
                (?:  # <BLOCK_OPEN>
                    {{
                 )
                |
                (?:  # <BLOCK_CLOSE>
                    }}
                 )
                |
                (?:  # <STMT_END>
                    ;
                 )
             )
         )
     )
    ''')
656
657
658#######################################
659# global declarations
660
# Matches one top-level (file-scope) item.  The alternatives are tried
# in this order:
#   * an empty statement,
#   * the opening of a compound type declaration ("struct [name] {"),
#     with optional leading text,
#   * a compound-type reference followed by text up to one of
#     "=", ",", ";", "(" or "{",
#   * a typedef (optionally with a function parameter list) ending in ";",
#   * a function declaration/definition (including the old-style form
#     with trailing parameter declarations) or a variable declaration
#     with optional initializer.
# Doubled braces are f-string escapes for literal "{" and "}".
GLOBAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?:  # <EMPTY>
            ;
         )
        |

        # compound type decl (maybe inline)
        (?:
            (?:
                (?:  # <COMPOUND_LEADING>
                    [^;{{}}]+?
                 )
                 \s*
             )?
            (?:  # <COMPOUND_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <COMPOUND_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # bogus inline decl artifact
        # This simplifies resolving the relative syntactic ambiguity of
        # inline structs.
        (?:
            (?:  # <FORWARD_KIND>
                {COMPOUND_TYPE_KIND}
             )
            \s*
            (?:  # <FORWARD_NAME>
                {ANON_IDENTIFIER}
             )
            (?:  # <MAYBE_INLINE_ACTUAL>
                [^=,;({{[*\]]*
                [=,;({{]
             )
         )
        |

        # typedef
        (?:
            \b typedef \b \s*
            (?:  # <TYPEDEF_DECL>
                {_ind(VAR_DECL, 4)}
             )
            (?:
                # We expect no inline type definitions in the parameters.
                \s* [(] \s*
                (?:  # <TYPEDEF_FUNC_PARAMS>
                    [^{{;]*
                 )
                \s* [)]
             )?
            \s* ;
         )
        |

        # func decl/definition & var decls
        # XXX dedicated pattern for funcs (more restricted)?
        (?:
            (?:
                (?:  # <VAR_STORAGE>
                    {STORAGE_CLASS}
                 )
                \s*
             )?
            (?:
                (?:  # <FUNC_INLINE>
                    \b inline \b
                 )
                \s*
             )?
            (?:  # <VAR_DECL>
                {_ind(VAR_DECL, 4)}
             )
            (?:
                # func decl / definition
                (?:
                    (?:
                        # We expect no inline type definitions in the parameters.
                        \s* [(] \s*
                        (?:  # <FUNC_PARAMS>
                            [^{{;]*
                         )
                        \s* [)] \s*
                        (?:  # <FUNC_DELIM>
                            [{{;]
                         )
                     )
                    |
                    (?:
                        # This is some old-school syntax!
                        \s* [(] \s*
                        # We throw away the bare names:
                        {STRICT_IDENTIFIER}
                        (?: \s* , \s* {STRICT_IDENTIFIER} )*
                        \s* [)] \s*

                        # We keep the trailing param declarations:
                        (?:  # <FUNC_LEGACY_PARAMS>
                            # There's at least one!
                            (?: {TYPE_QUALIFIER} \s* )?
                            {_ind(TYPE_SPEC, 7)}
                            \s*
                            {_ind(DECLARATOR, 7)}
                            \s* ;
                            (?:
                                \s*
                                (?: {TYPE_QUALIFIER} \s* )?
                                {_ind(TYPE_SPEC, 8)}
                                \s*
                                {_ind(DECLARATOR, 8)}
                                \s* ;
                             )*
                         )
                        \s* {{
                     )
                 )
                |
                # var / typedef
                (?:
                    (?:
                        # initializer
                        # We expect only basic initializers.
                        \s* = \s*
                        (?:  # <VAR_INIT>
                            {_ind(INITIALIZER, 6)}
                         )
                     )?
                    \s*
                    (?:  # <VAR_ENDING>
                        [,;]
                     )
                 )
             )
         )
     )
    ''')
806