# Parser for C code
# Originally by Mark Shannon (mark@hotpy.org)
# https://gist.github.com/markshannon/db7ab649440b5af765451bb77c7dba34

import re
from dataclasses import dataclass
from collections.abc import Iterator

def choice(*opts: str) -> str:
    """Return a regex alternation, wrapping each alternative in a group."""
    return "|".join(f"({opt})" for opt in opts)
# Regexes

# Longer operators must go before shorter ones.

PLUSPLUS = r"\+\+"
MINUSMINUS = r"--"

# ->
ARROW = r"->"
ELLIPSIS = r"\.\.\."

# Assignment operators
TIMESEQUAL = r"\*="
DIVEQUAL = r"/="
MODEQUAL = r"%="
PLUSEQUAL = r"\+="
MINUSEQUAL = r"-="
LSHIFTEQUAL = r"<<="
RSHIFTEQUAL = r">>="
ANDEQUAL = r"&="
OREQUAL = r"\|="
XOREQUAL = r"\^="

# Operators
PLUS = r"\+"
MINUS = r"-"
TIMES = r"\*"
DIVIDE = r"/"
MOD = r"%"
NOT = r"~"
XOR = r"\^"
LOR = r"\|\|"
LAND = r"&&"
LSHIFT = r"<<"
RSHIFT = r">>"
LE = r"<="
GE = r">="
EQ = r"=="
NE = r"!="
LT = r"<"
GT = r">"
LNOT = r"!"
OR = r"\|"
AND = r"&"
EQUALS = r"="

# ?
CONDOP = r"\?"

# Delimiters
LPAREN = r"\("
RPAREN = r"\)"
LBRACKET = r"\["
RBRACKET = r"\]"
LBRACE = r"\{"
RBRACE = r"\}"
COMMA = r","
PERIOD = r"\."
SEMI = r";"
COLON = r":"
BACKSLASH = r"\\"

# Every ALL-CAPS global defined so far is an operator pattern; collect
# them into a token-name -> regex-pattern table.
operators = {}
for op, pattern in list(globals().items()):
    if op == op.upper():
        operators[op] = pattern
# Rebind each operator name to itself so it doubles as the token kind
# (e.g. PLUS == "PLUS" after this loop).
for op in operators:
    globals()[op] = op
# opmap: literal operator text -> token kind.  Removing the regex
# backslash escapes yields the literal text; the BACKSLASH pattern
# strips down to "", so fall back to a single backslash for its key.
opmap = {}
for op, pattern in operators.items():
    opmap[pattern.replace("\\", "") or "\\"] = op

# Macros
macro = r"# *(ifdef|ifndef|undef|define|error|endif|if|else|include|#)"
CMACRO = "CMACRO"

id_re = r"[a-zA-Z_][0-9a-zA-Z_]*"
IDENTIFIER = "IDENTIFIER"


# Integer literals, with optional unsigned/long suffixes.
suffix = r"([uU]?[lL]?[lL]?)"
octal = r"0[0-7]+" + suffix
# NOTE(review): shadows the builtin `hex`; also has no suffix group,
# unlike octal/decimal -- confirm whether that is intentional.
hex = r"0[xX][0-9a-fA-F]+"
decimal_digits = r"(0|[1-9][0-9]*)"
decimal = decimal_digits + suffix


# Floating-point literals.  NOTE(review): `float` shadows the builtin.
exponent = r"""([eE][-+]?[0-9]+)"""
fraction = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
float = f"(((({fraction}){exponent}?)|([0-9]+{exponent}))[FfLl]?)"

number_re = choice(octal, hex, float, decimal)
NUMBER = "NUMBER"

# String literals, including escape sequences.
simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
decimal_escape = r"""(\d+)"""
hex_escape = r"""(x[0-9a-fA-F]+)"""
escape_sequence = (
    r"(\\(" + "|".join((simple_escape, decimal_escape, hex_escape)) + "))"
)
string_char = r"""([^"\\\n]|""" + escape_sequence + ")"
str_re = '"' + string_char + '*"'
STRING = "STRING"
char = r"\'.\'"  # TODO: escape sequence
CHARACTER = "CHARACTER"

# Line (//) and block (/* */) comments.
comment_re = r"(//.*)|/\*([^*]|\*[^/])*\*/"
COMMENT = "COMMENT"

newline = r"\n"
invalid = (
    r"\S"  # A single non-space character that's not caught by any of the other patterns
)
# The master token regex: first alternative wins, so identifiers and
# literals are tried before operators, and `invalid` is the catch-all.
matcher = re.compile(
    choice(
        id_re,
        number_re,
        str_re,
        char,
        newline,
        macro,
        comment_re,
        *operators.values(),
        invalid,
    )
)
letter = re.compile(r"[a-zA-Z_]")

# Keyword token kinds.  Each constant's value is its own name, so a
# token's `kind` can be compared against these directly.
AUTO = "AUTO"
BREAK = "BREAK"
CASE = "CASE"
CHAR = "CHAR"
CONST = "CONST"
CONTINUE = "CONTINUE"
DEFAULT = "DEFAULT"
DO = "DO"
DOUBLE = "DOUBLE"
ELSE = "ELSE"
ENUM = "ENUM"
EXTERN = "EXTERN"
FLOAT = "FLOAT"
FOR = "FOR"
GOTO = "GOTO"
IF = "IF"
INLINE = "INLINE"
INT = "INT"
LONG = "LONG"
OFFSETOF = "OFFSETOF"
RESTRICT = "RESTRICT"
RETURN = "RETURN"
SHORT = "SHORT"
SIGNED = "SIGNED"
SIZEOF = "SIZEOF"
STATIC = "STATIC"
STRUCT = "STRUCT"
SWITCH = "SWITCH"
TYPEDEF = "TYPEDEF"
UNION = "UNION"
UNSIGNED = "UNSIGNED"
VOID = "VOID"
VOLATILE = "VOLATILE"
WHILE = "WHILE"
INST = "INST"    # An instruction in the DSL
OP = "OP"        # A micro-op in the DSL
MACRO = "MACRO"  # A macro in the DSL

# All keyword kinds, in declaration order.
kwds = [
    AUTO, BREAK, CASE, CHAR, CONST, CONTINUE, DEFAULT, DO, DOUBLE, ELSE,
    ENUM, EXTERN, FLOAT, FOR, GOTO, IF, INLINE, INT, LONG, OFFSETOF,
    RESTRICT, RETURN, SHORT, SIGNED, SIZEOF, STATIC, STRUCT, SWITCH,
    TYPEDEF, UNION, UNSIGNED, VOID, VOLATILE, WHILE, INST, OP, MACRO,
]
# Lower-case source spelling -> token kind.
keywords = {name.lower(): name for name in kwds}

ANNOTATION = "ANNOTATION"
# Identifiers treated as annotations rather than plain IDENTIFIERs.
annotations = {
    "specializing",
    "override",
    "register",
    "replaced",
    "pure",
    "split",
    "replicate",
    "tier1",
    "tier2",
}

__all__ = []
__all__.extend(kwds)

235def make_syntax_error(
236    message: str,
237    filename: str | None,
238    line: int,
239    column: int,
240    line_text: str,
241) -> SyntaxError:
242    return SyntaxError(message, (filename, line, column, line_text))
243
244
245@dataclass(slots=True)
246class Token:
247    filename: str
248    kind: str
249    text: str
250    begin: tuple[int, int]
251    end: tuple[int, int]
252
253    @property
254    def line(self) -> int:
255        return self.begin[0]
256
257    @property
258    def column(self) -> int:
259        return self.begin[1]
260
261    @property
262    def end_line(self) -> int:
263        return self.end[0]
264
265    @property
266    def end_column(self) -> int:
267        return self.end[1]
268
269    @property
270    def width(self) -> int:
271        return self.end[1] - self.begin[1]
272
273    def replaceText(self, txt: str) -> "Token":
274        assert isinstance(txt, str)
275        return Token(self.filename, self.kind, txt, self.begin, self.end)
276
277    def __repr__(self) -> str:
278        b0, b1 = self.begin
279        e0, e1 = self.end
280        if b0 == e0:
281            return f"{self.kind}({self.text!r}, {b0}:{b1}:{e1})"
282        else:
283            return f"{self.kind}({self.text!r}, {b0}:{b1}, {e0}:{e1})"
284
285
def tokenize(src: str, line: int = 1, filename: str = "") -> Iterator[Token]:
    """Yield a Token for every lexeme in *src*, tracking line/column.

    Newline matches update the line bookkeeping but are not yielded.
    Raises a SyntaxError (via make_syntax_error) on unrecognized input.
    """
    linestart = -1  # index in src of the most recent newline seen
    for mo in matcher.finditer(src):
        start = mo.start()
        text = mo.group(0)
        # The order of these tests matters: keywords and annotations are
        # also identifiers, and "..."/"." would otherwise fall through
        # to the NUMBER test below.
        if text in keywords:
            kind = keywords[text]
        elif text in annotations:
            kind = ANNOTATION
        elif letter.match(text):
            kind = IDENTIFIER
        elif text == "...":
            kind = ELLIPSIS
        elif text == ".":
            kind = PERIOD
        elif text[0] in "0123456789.":
            kind = NUMBER
        elif text[0] == '"':
            kind = STRING
        elif text in opmap:
            kind = opmap[text]
        elif text == "\n":
            linestart = start
            line += 1
            kind = "\n"
        elif text[0] == "'":
            kind = CHARACTER
        elif text[0] == "#":
            kind = CMACRO
        elif text[0] == "/" and text[1] in "/*":
            kind = COMMENT
        else:
            lineend = src.find("\n", start)
            if lineend == -1:
                lineend = len(src)
            raise make_syntax_error(
                f"Bad token: {text}",
                filename,
                line,
                start - linestart + 1,
                src[linestart:lineend],
            )
        begin = line, start - linestart
        if kind == COMMENT:
            # A block comment may span lines: advance the line counter
            # and remember where its final line begins, so the end
            # position below is computed relative to that line.
            newline_count = text.count("\n")
            if newline_count:
                linestart = start + text.rfind("\n")
                line += newline_count
        if kind != "\n":
            yield Token(
                filename, kind, text, begin, (line, start - linestart + len(text))
            )

def to_text(tkns: list[Token], dedent: int = 0) -> str:
    """Render *tkns* back to text, using each token's begin/end positions
    to reinsert newlines and column padding."""
    parts: list[str] = []
    line, col = -1, 1 + dedent
    for tkn in tkns:
        if line == -1:
            # Start the output on the first token's line.
            line, _ = tkn.begin
        tkn_line, tkn_col = tkn.begin
        # Emit newlines until we reach the token's line.
        while tkn_line > line:
            line += 1
            parts.append("\n")
            col = 1 + dedent
        # Pad with spaces up to the token's starting column.
        parts.append(" " * (tkn_col - col))
        text = tkn.text
        if dedent != 0 and tkn.kind == "COMMENT" and "\n" in text:
            if dedent < 0:
                # Re-indent continuation lines of multi-line comments.
                text = text.replace("\n", "\n" + " " * -dedent)
            # TODO: dedent > 0
        parts.append(text)
        line, col = tkn.end
    return "".join(parts)

if __name__ == "__main__":
    import sys

    # Usage: lexer.py FILE        -- tokenize a file
    #        lexer.py -c "SOURCE" -- tokenize a string
    filename = sys.argv[1]
    if filename == "-c":
        src = sys.argv[2]
    else:
        # Use a context manager so the file is closed deterministically
        # (the original `open(filename).read()` leaked the handle).
        with open(filename) as f:
            src = f.read()
    for tkn in tokenize(src, filename=filename):
        print(tkn)