# Parser for C code
# Originally by Mark Shannon (mark@hotpy.org)
# https://gist.github.com/markshannon/db7ab649440b5af765451bb77c7dba34

import re
from dataclasses import dataclass
from collections.abc import Iterator


def choice(*opts: str) -> str:
    return "|".join("(%s)" % opt for opt in opts)


# Regexes

# Longer operators must go before shorter ones.

PLUSPLUS = r"\+\+"
MINUSMINUS = r"--"

# ->
ARROW = r"->"
ELLIPSIS = r"\.\.\."

# Assignment operators
TIMESEQUAL = r"\*="
DIVEQUAL = r"/="
MODEQUAL = r"%="
PLUSEQUAL = r"\+="
MINUSEQUAL = r"-="
LSHIFTEQUAL = r"<<="
RSHIFTEQUAL = r">>="
ANDEQUAL = r"&="
OREQUAL = r"\|="
XOREQUAL = r"\^="

# Operators
PLUS = r"\+"
MINUS = r"-"
TIMES = r"\*"
DIVIDE = r"/"
MOD = r"%"
NOT = r"~"
XOR = r"\^"
LOR = r"\|\|"
LAND = r"&&"
LSHIFT = r"<<"
RSHIFT = r">>"
LE = r"<="
GE = r">="
EQ = r"=="
NE = r"!="
LT = r"<"
GT = r">"
LNOT = r"!"
OR = r"\|"
AND = r"&"
EQUALS = r"="

# ?
CONDOP = r"\?"

# Delimiters
LPAREN = r"\("
RPAREN = r"\)"
LBRACKET = r"\["
RBRACKET = r"\]"
LBRACE = r"\{"
RBRACE = r"\}"
COMMA = r","
PERIOD = r"\."
SEMI = r";"
COLON = r":"
BACKSLASH = r"\\"

operators = {op: pattern for op, pattern in globals().items() if op == op.upper()}
for op in operators:
    globals()[op] = op
opmap = {pattern.replace("\\", "") or "\\": op for op, pattern in operators.items()}

# Macros
macro = r"# *(ifdef|ifndef|undef|define|error|endif|if|else|include|#)"
CMACRO = "CMACRO"

id_re = r"[a-zA-Z_][0-9a-zA-Z_]*"
IDENTIFIER = "IDENTIFIER"


suffix = r"([uU]?[lL]?[lL]?)"
octal = r"0[0-7]+" + suffix
hex = r"0[xX][0-9a-fA-F]+"
decimal_digits = r"(0|[1-9][0-9]*)"
decimal = decimal_digits + suffix


exponent = r"""([eE][-+]?[0-9]+)"""
fraction = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
float = "((((" + fraction + ")" + exponent + "?)|([0-9]+" + exponent + "))[FfLl]?)"

number_re = choice(octal, hex, float, decimal)
NUMBER = "NUMBER"

simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
decimal_escape = r"""(\d+)"""
hex_escape = r"""(x[0-9a-fA-F]+)"""
escape_sequence = (
    r"""(\\(""" + simple_escape + "|" + decimal_escape + "|" + hex_escape + "))"
)
string_char = r"""([^"\\\n]|""" + escape_sequence + ")"
str_re = '"' + string_char + '*"'
STRING = "STRING"
char = r"\'.\'"  # TODO: escape sequence
CHARACTER = "CHARACTER"

comment_re = r"(//.*)|/\*([^*]|\*[^/])*\*/"
COMMENT = "COMMENT"

newline = r"\n"
invalid = (
    r"\S"  # A single non-space character that's not caught by any of the other patterns
)
matcher = re.compile(
    choice(
        id_re,
        number_re,
        str_re,
        char,
        newline,
        macro,
        comment_re,
        *operators.values(),
        invalid,
    )
)
letter = re.compile(r"[a-zA-Z_]")


kwds = []
AUTO = "AUTO"
kwds.append(AUTO)
BREAK = "BREAK"
kwds.append(BREAK)
CASE = "CASE"
kwds.append(CASE)
CHAR = "CHAR"
kwds.append(CHAR)
CONST = "CONST"
kwds.append(CONST)
CONTINUE = "CONTINUE"
kwds.append(CONTINUE)
DEFAULT = "DEFAULT"
kwds.append(DEFAULT)
DO = "DO"
kwds.append(DO)
DOUBLE = "DOUBLE"
kwds.append(DOUBLE)
ELSE = "ELSE"
kwds.append(ELSE)
ENUM = "ENUM"
kwds.append(ENUM)
EXTERN = "EXTERN"
kwds.append(EXTERN)
FLOAT = "FLOAT"
kwds.append(FLOAT)
FOR = "FOR"
kwds.append(FOR)
GOTO = "GOTO"
kwds.append(GOTO)
IF = "IF"
kwds.append(IF)
INLINE = "INLINE"
kwds.append(INLINE)
INT = "INT"
kwds.append(INT)
LONG = "LONG"
kwds.append(LONG)
OFFSETOF = "OFFSETOF"
kwds.append(OFFSETOF)
RESTRICT = "RESTRICT"
kwds.append(RESTRICT)
RETURN = "RETURN"
kwds.append(RETURN)
SHORT = "SHORT"
kwds.append(SHORT)
SIGNED = "SIGNED"
kwds.append(SIGNED)
SIZEOF = "SIZEOF"
kwds.append(SIZEOF)
STATIC = "STATIC"
kwds.append(STATIC)
STRUCT = "STRUCT"
kwds.append(STRUCT)
SWITCH = "SWITCH"
kwds.append(SWITCH)
TYPEDEF = "TYPEDEF"
kwds.append(TYPEDEF)
UNION = "UNION"
kwds.append(UNION)
UNSIGNED = "UNSIGNED"
kwds.append(UNSIGNED)
VOID = "VOID"
kwds.append(VOID)
VOLATILE = "VOLATILE"
kwds.append(VOLATILE)
WHILE = "WHILE"
kwds.append(WHILE)
# An instruction in the DSL
INST = "INST"
kwds.append(INST)
# A micro-op in the DSL
OP = "OP"
kwds.append(OP)
# A macro in the DSL
MACRO = "MACRO"
kwds.append(MACRO)
keywords = {name.lower(): name for name in kwds}

ANNOTATION = "ANNOTATION"
annotations = {
    "specializing",
    "override",
    "register",
    "replaced",
    "pure",
    "split",
    "replicate",
    "tier1",
    "tier2",
}

__all__ = []
__all__.extend(kwds)


def make_syntax_error(
    message: str,
    filename: str | None,
    line: int,
    column: int,
    line_text: str,
) -> SyntaxError:
    return SyntaxError(message, (filename, line, column, line_text))


@dataclass(slots=True)
class Token:
    filename: str
    kind: str
    text: str
    begin: tuple[int, int]
    end: tuple[int, int]

    @property
    def line(self) -> int:
        return self.begin[0]

    @property
    def column(self) -> int:
        return self.begin[1]

    @property
    def end_line(self) -> int:
        return self.end[0]

    @property
    def end_column(self) -> int:
        return self.end[1]

    @property
    def width(self) -> int:
        return self.end[1] - self.begin[1]

    def replaceText(self, txt: str) -> "Token":
        assert isinstance(txt, str)
        return Token(self.filename, self.kind, txt, self.begin, self.end)

    def __repr__(self) -> str:
        b0, b1 = self.begin
        e0, e1 = self.end
        if b0 == e0:
            return f"{self.kind}({self.text!r}, {b0}:{b1}:{e1})"
        else:
            return f"{self.kind}({self.text!r}, {b0}:{b1}, {e0}:{e1})"


def tokenize(src: str, line: int = 1, filename: str = "") -> Iterator[Token]:
    linestart = -1
    for m in matcher.finditer(src):
        start, end = m.span()
        text = m.group(0)
        if text in keywords:
            kind = keywords[text]
        elif text in annotations:
            kind = ANNOTATION
        elif letter.match(text):
            kind = IDENTIFIER
        elif text == "...":
            kind = ELLIPSIS
        elif text == ".":
            kind = PERIOD
        elif text[0] in "0123456789.":
            kind = NUMBER
        elif text[0] == '"':
            kind = STRING
        elif text in opmap:
            kind = opmap[text]
        elif text == "\n":
            linestart = start
            line += 1
            kind = "\n"
        elif text[0] == "'":
            kind = CHARACTER
        elif text[0] == "#":
            kind = CMACRO
        elif text[0] == "/" and text[1] in "/*":
            kind = COMMENT
        else:
            lineend = src.find("\n", start)
            if lineend == -1:
                lineend = len(src)
            raise make_syntax_error(
                f"Bad token: {text}",
                filename,
                line,
                start - linestart + 1,
                src[linestart:lineend],
            )
        if kind == COMMENT:
            begin = line, start - linestart
            newlines = text.count("\n")
            if newlines:
                linestart = start + text.rfind("\n")
                line += newlines
        else:
            begin = line, start - linestart
        if kind != "\n":
            yield Token(
                filename, kind, text, begin, (line, start - linestart + len(text))
            )


def to_text(tkns: list[Token], dedent: int = 0) -> str:
    res: list[str] = []
    line, col = -1, 1 + dedent
    for tkn in tkns:
        if line == -1:
            line, _ = tkn.begin
        l, c = tkn.begin
        # assert(l >= line), (line, txt, start, end)
        while l > line:
            line += 1
            res.append("\n")
            col = 1 + dedent
        res.append(" " * (c - col))
        text = tkn.text
        if dedent != 0 and tkn.kind == "COMMENT" and "\n" in text:
            if dedent < 0:
                text = text.replace("\n", "\n" + " " * -dedent)
            # TODO: dedent > 0
        res.append(text)
        line, col = tkn.end
    return "".join(res)


if __name__ == "__main__":
    import sys

    filename = sys.argv[1]
    if filename == "-c":
        src = sys.argv[2]
    else:
        src = open(filename).read()
    # print(to_text(tokenize(src)))
    for tkn in tokenize(src, filename=filename):
        print(tkn)
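
# Example usage (a sketch, not part of the original module): a small C snippet
# run through tokenize() and reassembled with to_text(). The token kinds come
# from the names defined above; the snippet and the expected results shown in
# the trailing comments are illustrative assumptions, not captured output.
#
#     src = "x += 1; // bump\n"
#     toks = list(tokenize(src, filename="<example>"))
#     [t.kind for t in toks]   # ['IDENTIFIER', 'PLUSEQUAL', 'NUMBER', 'SEMI', 'COMMENT']
#     to_text(toks)            # 'x += 1; // bump'
#
# Note that newline tokens are counted for line/column bookkeeping but never
# yielded, so to_text() reconstructs spacing purely from each Token's
# begin/end coordinates.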