import itertools
import logging
import os
import pathlib
import sys
import sysconfig
import tempfile
import tokenize
from typing import IO, Any, Dict, List, Optional, Set, Tuple

from pegen.c_generator import CParserGenerator
from pegen.grammar import Grammar
from pegen.grammar_parser import GeneratedParser as GrammarParser
from pegen.parser import Parser
from pegen.parser_generator import ParserGenerator
from pegen.python_generator import PythonParserGenerator
from pegen.tokenizer import Tokenizer

MOD_DIR = pathlib.Path(__file__).resolve().parent

TokenDefinitions = Tuple[Dict[int, str], Dict[str, int], Set[str]]
Incomplete = Any  # TODO: install `types-setuptools` and remove this alias


def get_extra_flags(compiler_flags: str, compiler_py_flags_nodist: str) -> List[str]:
    flags = sysconfig.get_config_var(compiler_flags)
    py_flags_nodist = sysconfig.get_config_var(compiler_py_flags_nodist)
    if flags is None or py_flags_nodist is None:
        return []
    return f"{flags} {py_flags_nodist}".split()


def fixup_build_ext(cmd: Incomplete) -> None:
    """Function needed to make build_ext tests pass.

    When Python was built with --enable-shared on Unix, -L. is not enough to
    find libpython<blah>.so, because regrtest runs in a tempdir, not in the
    source directory where the .so lives.

    When Python was built in debug mode on Windows, build_ext commands
    need their debug attribute set, and it is not done automatically for
    some reason.

    This function handles both of these things. Example use:

        cmd = build_ext(dist)
        support.fixup_build_ext(cmd)
        cmd.ensure_finalized()

    Unlike most other Unix platforms, Mac OS X embeds absolute paths
    to shared libraries into executables, so the fixup is not needed there.

    Taken from distutils (was part of the CPython stdlib until Python 3.11)
    """
    if os.name == "nt":
        cmd.debug = sys.executable.endswith("_d.exe")
    elif sysconfig.get_config_var("Py_ENABLE_SHARED"):
        # To further add to the shared builds fun on Unix, we can't just add
        # library_dirs to the Extension() instance because that doesn't get
        # plumbed through to the final compiler command.
        runshared = sysconfig.get_config_var("RUNSHARED")
        if runshared is None:
            cmd.library_dirs = ["."]
        else:
            if sys.platform == "darwin":
                cmd.library_dirs = []
            else:
                name, equals, value = runshared.partition("=")
                cmd.library_dirs = [d for d in value.split(os.pathsep) if d]


def compile_c_extension(
    generated_source_path: str,
    build_dir: Optional[str] = None,
    verbose: bool = False,
    keep_asserts: bool = True,
    disable_optimization: bool = False,
    library_dir: Optional[str] = None,
) -> pathlib.Path:
    """Compile the generated source for a parser generator into an extension module.

    The extension module will be generated in the same directory as the provided path
    for the generated source, with the same basename (in addition to extension module
    metadata). For example, for the source mydir/parser.c the generated extension
    on a darwin system with Python 3.8 will be mydir/parser.cpython-38-darwin.so.

    If *build_dir* is provided, that path will be used as the temporary build directory
    of distutils (this is useful in case you want to use a temporary directory).
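
    For example (the source path and build directory below are illustrative):

        extension_path = compile_c_extension(
            "mydir/parser.c",
            build_dir="/tmp/pegen-build",
        )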

    If *library_dir* is provided, that path will be used as the directory for a
    static library of the common parser sources (this is useful in case you are
    creating multiple extensions).
    """
    import setuptools.command.build_ext
    import setuptools.logging

    from setuptools import Extension, Distribution
    from setuptools._distutils.dep_util import newer_group
    from setuptools._distutils.ccompiler import new_compiler
    from setuptools._distutils.sysconfig import customize_compiler

    if verbose:
        setuptools.logging.set_threshold(logging.DEBUG)

    source_file_path = pathlib.Path(generated_source_path)
    extension_name = source_file_path.stem
    extra_compile_args = get_extra_flags("CFLAGS", "PY_CFLAGS_NODIST")
    extra_compile_args.append("-DPy_BUILD_CORE_MODULE")
    # Define _Py_TEST_PEGEN to not call PyAST_Validate() in Parser/pegen.c
    extra_compile_args.append("-D_Py_TEST_PEGEN")
    extra_link_args = get_extra_flags("LDFLAGS", "PY_LDFLAGS_NODIST")
    if keep_asserts:
        extra_compile_args.append("-UNDEBUG")
    if disable_optimization:
        if sys.platform == "win32":
            extra_compile_args.append("/Od")
            extra_link_args.append("/LTCG:OFF")
        else:
            extra_compile_args.append("-O0")
            if sysconfig.get_config_var("GNULD") == "yes":
                extra_link_args.append("-fno-lto")

    common_sources = [
        str(MOD_DIR.parent.parent.parent / "Python" / "Python-ast.c"),
        str(MOD_DIR.parent.parent.parent / "Python" / "asdl.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "lexer" / "lexer.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "lexer" / "state.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "lexer" / "buffer.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "string_tokenizer.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "file_tokenizer.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "utf8_tokenizer.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "readline_tokenizer.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "helpers.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "pegen.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "pegen_errors.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "action_helpers.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "string_parser.c"),
        str(MOD_DIR.parent / "peg_extension" / "peg_extension.c"),
    ]
    include_dirs = [
        str(MOD_DIR.parent.parent.parent / "Include" / "internal"),
        str(MOD_DIR.parent.parent.parent / "Include" / "internal" / "mimalloc"),
        str(MOD_DIR.parent.parent.parent / "Parser"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "lexer"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer"),
    ]
    if sys.platform == "win32":
        # HACK: The location of pyconfig.h has moved within our build, and
        # setuptools hasn't updated for it yet. So add the path manually for now.
        include_dirs.append(pathlib.Path(sysconfig.get_config_h_filename()).parent)
    extension = Extension(
        extension_name,
        sources=[generated_source_path],
        extra_compile_args=extra_compile_args,
        extra_link_args=extra_link_args,
    )
    dist = Distribution({"name": extension_name, "ext_modules": [extension]})
    cmd = dist.get_command_obj("build_ext")
    assert isinstance(cmd, setuptools.command.build_ext.build_ext)
    fixup_build_ext(cmd)
    cmd.build_lib = str(source_file_path.parent)
    cmd.include_dirs = include_dirs
    if build_dir:
        cmd.build_temp = build_dir
    cmd.ensure_finalized()

    compiler = new_compiler()
    customize_compiler(compiler)
    compiler.set_include_dirs(cmd.include_dirs)
    compiler.set_library_dirs(cmd.library_dirs)
    # build static lib
    if library_dir:
        library_filename = compiler.library_filename(extension_name, output_dir=library_dir)
        if newer_group(common_sources, library_filename, "newer"):
            if sys.platform == "win32":
                assert compiler.static_lib_format
                pdb = compiler.static_lib_format % (extension_name, ".pdb")
                compile_opts = [f"/Fd{library_dir}\\{pdb}"]
                compile_opts.extend(extra_compile_args)
            else:
                compile_opts = extra_compile_args
            objects = compiler.compile(
                common_sources,
                output_dir=library_dir,
                debug=cmd.debug,
                extra_postargs=compile_opts,
            )
            compiler.create_static_lib(
                objects, extension_name, output_dir=library_dir, debug=cmd.debug
            )
        if sys.platform == "win32":
            compiler.add_library_dir(library_dir)
            extension.libraries = [extension_name]
        elif sys.platform == "darwin":
            compiler.set_link_objects(
                [
                    "-Wl,-force_load",
                    library_filename,
                ]
            )
        else:
            compiler.set_link_objects(
                [
                    "-Wl,--whole-archive",
                    library_filename,
                    "-Wl,--no-whole-archive",
                ]
            )
    else:
        extension.sources[0:0] = common_sources

    # Compile the source code to object files.
    ext_path = cmd.get_ext_fullpath(extension_name)
    if newer_group(extension.sources, ext_path, "newer"):
        objects = compiler.compile(
            extension.sources,
            output_dir=cmd.build_temp,
            debug=cmd.debug,
            extra_postargs=extra_compile_args,
        )
    else:
        objects = compiler.object_filenames(extension.sources, output_dir=cmd.build_temp)
    # The cmd.get_libraries() call needs a valid compiler attribute or we will
    # get an incorrect library name on the free-threaded Windows build.
    cmd.compiler = compiler
    # Now link the object files together into a "shared object"
    compiler.link_shared_object(
        objects,
        ext_path,
        libraries=cmd.get_libraries(extension),
        extra_postargs=extra_link_args,
        export_symbols=cmd.get_export_symbols(extension),  # type: ignore[no-untyped-call]
        debug=cmd.debug,
        build_temp=cmd.build_temp,
    )

    return pathlib.Path(ext_path)


def build_parser(
    grammar_file: str, verbose_tokenizer: bool = False, verbose_parser: bool = False
) -> Tuple[Grammar, Parser, Tokenizer]:
    with open(grammar_file) as file:
        tokenizer = Tokenizer(tokenize.generate_tokens(file.readline), verbose=verbose_tokenizer)
        parser = GrammarParser(tokenizer, verbose=verbose_parser)
        grammar = parser.start()

        if not grammar:
            raise parser.make_syntax_error(grammar_file)

    return grammar, parser, tokenizer


def generate_token_definitions(tokens: IO[str]) -> TokenDefinitions:
    all_tokens = {}
    exact_tokens = {}
    non_exact_tokens = set()
    numbers = itertools.count(0)

    for line in tokens:
        line = line.strip()

        if not line or line.startswith("#"):
            continue

        pieces = line.split()
        index = next(numbers)

        if len(pieces) == 1:
            # A bare token name (e.g. NAME) with no fixed string representation.
            (token,) = pieces
            non_exact_tokens.add(token)
            all_tokens[index] = token
        elif len(pieces) == 2:
            # A token name followed by its exact string (e.g. LPAR '(').
            token, op = pieces
            exact_tokens[op.strip("'")] = index
            all_tokens[index] = token
        else:
            raise ValueError(f"Unexpected line found in Tokens file: {line}")

    return all_tokens, exact_tokens, non_exact_tokens


def build_c_generator(
    grammar: Grammar,
    grammar_file: str,
    tokens_file: str,
    output_file: str,
    compile_extension: bool = False,
    verbose_c_extension: bool = False,
    keep_asserts_in_extension: bool = True,
    skip_actions: bool = False,
) -> ParserGenerator:
    with open(tokens_file, "r") as tok_file:
        all_tokens, exact_tok, non_exact_tok = generate_token_definitions(tok_file)
    with open(output_file, "w") as file:
        gen: ParserGenerator = CParserGenerator(
            grammar, all_tokens, exact_tok, non_exact_tok, file, skip_actions=skip_actions
        )
        gen.generate(grammar_file)

    if compile_extension:
        with tempfile.TemporaryDirectory() as build_dir:
            compile_c_extension(
                output_file,
                build_dir=build_dir,
                verbose=verbose_c_extension,
                keep_asserts=keep_asserts_in_extension,
            )
    return gen


def build_python_generator(
    grammar: Grammar,
    grammar_file: str,
    output_file: str,
    skip_actions: bool = False,
) -> ParserGenerator:
    with open(output_file, "w") as file:
        gen: ParserGenerator = PythonParserGenerator(grammar, file)  # TODO: skip_actions
        gen.generate(grammar_file)
    return gen


def build_c_parser_and_generator(
    grammar_file: str,
    tokens_file: str,
    output_file: str,
    compile_extension: bool = False,
    verbose_tokenizer: bool = False,
    verbose_parser: bool = False,
    verbose_c_extension: bool = False,
    keep_asserts_in_extension: bool = True,
    skip_actions: bool = False,
) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
    """Generate rules, C parser, tokenizer, parser generator for a given grammar

    Args:
        grammar_file (string): Path for the grammar file
        tokens_file (string): Path for the tokens file
        output_file (string): Path for the output file
        compile_extension (bool, optional): Whether to compile the C extension.
          Defaults to False.
        verbose_tokenizer (bool, optional): Whether to display additional output
          when generating the tokenizer. Defaults to False.
        verbose_parser (bool, optional): Whether to display additional output
          when generating the parser. Defaults to False.
        verbose_c_extension (bool, optional): Whether to display additional
          output when compiling the C extension. Defaults to False.
        keep_asserts_in_extension (bool, optional): Whether to keep the assert statements
          when compiling the extension module. Defaults to True.
        skip_actions (bool, optional): Whether to pretend no rule has any actions.
    """
    grammar, parser, tokenizer = build_parser(grammar_file, verbose_tokenizer, verbose_parser)
    gen = build_c_generator(
        grammar,
        grammar_file,
        tokens_file,
        output_file,
        compile_extension,
        verbose_c_extension,
        keep_asserts_in_extension,
        skip_actions=skip_actions,
    )

    return grammar, parser, tokenizer, gen


def build_python_parser_and_generator(
    grammar_file: str,
    output_file: str,
    verbose_tokenizer: bool = False,
    verbose_parser: bool = False,
    skip_actions: bool = False,
) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
    """Generate rules, python parser, tokenizer, parser generator for a given grammar

    Args:
        grammar_file (string): Path for the grammar file
        output_file (string): Path for the output file
        verbose_tokenizer (bool, optional): Whether to display additional output
          when generating the tokenizer. Defaults to False.
        verbose_parser (bool, optional): Whether to display additional output
          when generating the parser. Defaults to False.
        skip_actions (bool, optional): Whether to pretend no rule has any actions.
    """
    grammar, parser, tokenizer = build_parser(grammar_file, verbose_tokenizer, verbose_parser)
    gen = build_python_generator(
        grammar,
        grammar_file,
        output_file,
        skip_actions=skip_actions,
    )
    return grammar, parser, tokenizer, gen
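

# A minimal, illustrative driver (not part of the public API): the grammar and
# output paths below are placeholders, so adjust them before running.
if __name__ == "__main__":
    demo_grammar = "data/simple.gram"  # hypothetical grammar file
    demo_output = "parse_simple.py"    # where the generated Python parser is written
    grammar, parser, tokenizer, gen = build_python_parser_and_generator(
        demo_grammar,
        demo_output,
        verbose_tokenizer=False,
        verbose_parser=False,
    )
    print(f"Generated a parser for {demo_grammar!r} at {demo_output!r}")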