import itertools
import logging
import os
import pathlib
import sys
import sysconfig
import tempfile
import tokenize
from typing import IO, Any, Dict, List, Optional, Set, Tuple

from pegen.c_generator import CParserGenerator
from pegen.grammar import Grammar
from pegen.grammar_parser import GeneratedParser as GrammarParser
from pegen.parser import Parser
from pegen.parser_generator import ParserGenerator
from pegen.python_generator import PythonParserGenerator
from pegen.tokenizer import Tokenizer

MOD_DIR = pathlib.Path(__file__).resolve().parent

TokenDefinitions = Tuple[Dict[int, str], Dict[str, int], Set[str]]
Incomplete = Any  # TODO: install `types-setuptools` and remove this alias


def get_extra_flags(compiler_flags: str, compiler_py_flags_nodist: str) -> List[str]:
    flags = sysconfig.get_config_var(compiler_flags)
    py_flags_nodist = sysconfig.get_config_var(compiler_py_flags_nodist)
    if flags is None or py_flags_nodist is None:
        return []
    return f"{flags} {py_flags_nodist}".split()
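
# A minimal usage sketch for get_extra_flags (the exact flags depend on how
# this interpreter was configured; when either config var is unset, e.g. on
# typical Windows installs, the result is simply []):
#
#     extra_cflags = get_extra_flags("CFLAGS", "PY_CFLAGS_NODIST")
#     extra_ldflags = get_extra_flags("LDFLAGS", "PY_LDFLAGS_NODIST")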


def fixup_build_ext(cmd: Incomplete) -> None:
    """Function needed to make build_ext tests pass.

    When Python was built with --enable-shared on Unix, -L. is not enough to
    find libpython<blah>.so, because regrtest runs in a tempdir, not in the
    source directory where the .so lives.

    When Python was built in debug mode on Windows, build_ext commands
    need their debug attribute set, and it is not done automatically for
    some reason.

    This function handles both of these things.  Example use:

        cmd = build_ext(dist)
        support.fixup_build_ext(cmd)
        cmd.ensure_finalized()

    Unlike most other Unix platforms, Mac OS X embeds absolute paths
    to shared libraries into executables, so the fixup is not needed there.

    Taken from distutils (was part of the CPython stdlib until Python 3.11)
    """
    if os.name == "nt":
        cmd.debug = sys.executable.endswith("_d.exe")
    elif sysconfig.get_config_var("Py_ENABLE_SHARED"):
        # To further add to the shared builds fun on Unix, we can't just add
        # library_dirs to the Extension() instance because that doesn't get
        # plumbed through to the final compiler command.
        runshared = sysconfig.get_config_var("RUNSHARED")
        if runshared is None:
            cmd.library_dirs = ["."]
        else:
            if sys.platform == "darwin":
                cmd.library_dirs = []
            else:
                name, equals, value = runshared.partition("=")
                cmd.library_dirs = [d for d in value.split(os.pathsep) if d]


def compile_c_extension(
    generated_source_path: str,
    build_dir: Optional[str] = None,
    verbose: bool = False,
    keep_asserts: bool = True,
    disable_optimization: bool = False,
    library_dir: Optional[str] = None,
) -> pathlib.Path:
    """Compile the generated source for a parser generator into an extension module.

    The extension module will be generated in the same directory as the provided path
    for the generated source, with the same basename (plus extension module
    metadata). For example, for the source mydir/parser.c the generated extension
    on a Darwin system with Python 3.8 will be mydir/parser.cpython-38-darwin.so.

    If *build_dir* is provided, that path will be used as the temporary build directory
    of distutils (this is useful, for example, to build inside a temporary directory).

    If *library_dir* is provided, that path will be used as the directory for a
    static library of the common parser sources (this is useful in case you are
    creating multiple extensions).
    """
    import setuptools.command.build_ext
    import setuptools.logging

    from setuptools import Extension, Distribution
    from setuptools._distutils.dep_util import newer_group
    from setuptools._distutils.ccompiler import new_compiler
    from setuptools._distutils.sysconfig import customize_compiler

    if verbose:
        setuptools.logging.set_threshold(logging.DEBUG)

    source_file_path = pathlib.Path(generated_source_path)
    extension_name = source_file_path.stem
    extra_compile_args = get_extra_flags("CFLAGS", "PY_CFLAGS_NODIST")
    extra_compile_args.append("-DPy_BUILD_CORE_MODULE")
    # Define _Py_TEST_PEGEN to not call PyAST_Validate() in Parser/pegen.c
    extra_compile_args.append("-D_Py_TEST_PEGEN")
    extra_link_args = get_extra_flags("LDFLAGS", "PY_LDFLAGS_NODIST")
    if keep_asserts:
        extra_compile_args.append("-UNDEBUG")
    if disable_optimization:
        if sys.platform == "win32":
            extra_compile_args.append("/Od")
            extra_link_args.append("/LTCG:OFF")
        else:
            extra_compile_args.append("-O0")
            if sysconfig.get_config_var("GNULD") == "yes":
                extra_link_args.append("-fno-lto")

    common_sources = [
        str(MOD_DIR.parent.parent.parent / "Python" / "Python-ast.c"),
        str(MOD_DIR.parent.parent.parent / "Python" / "asdl.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "lexer" / "lexer.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "lexer" / "state.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "lexer" / "buffer.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "string_tokenizer.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "file_tokenizer.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "utf8_tokenizer.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "readline_tokenizer.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "helpers.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "pegen.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "pegen_errors.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "action_helpers.c"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "string_parser.c"),
        str(MOD_DIR.parent / "peg_extension" / "peg_extension.c"),
    ]
    include_dirs = [
        str(MOD_DIR.parent.parent.parent / "Include" / "internal"),
        str(MOD_DIR.parent.parent.parent / "Include" / "internal" / "mimalloc"),
        str(MOD_DIR.parent.parent.parent / "Parser"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "lexer"),
        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer"),
    ]
    if sys.platform == "win32":
        # HACK: The location of pyconfig.h has moved within our build, and
        # setuptools hasn't updated for it yet. So add the path manually for now
        include_dirs.append(pathlib.Path(sysconfig.get_config_h_filename()).parent)
    extension = Extension(
        extension_name,
        sources=[generated_source_path],
        extra_compile_args=extra_compile_args,
        extra_link_args=extra_link_args,
    )
    dist = Distribution({"name": extension_name, "ext_modules": [extension]})
    cmd = dist.get_command_obj("build_ext")
    assert isinstance(cmd, setuptools.command.build_ext.build_ext)
    fixup_build_ext(cmd)
    cmd.build_lib = str(source_file_path.parent)
    cmd.include_dirs = include_dirs
    if build_dir:
        cmd.build_temp = build_dir
    cmd.ensure_finalized()

    compiler = new_compiler()
    customize_compiler(compiler)
    compiler.set_include_dirs(cmd.include_dirs)
    compiler.set_library_dirs(cmd.library_dirs)
    # build static lib
    if library_dir:
        library_filename = compiler.library_filename(extension_name, output_dir=library_dir)
        if newer_group(common_sources, library_filename, "newer"):
            if sys.platform == "win32":
                assert compiler.static_lib_format
                pdb = compiler.static_lib_format % (extension_name, ".pdb")
                compile_opts = [f"/Fd{library_dir}\\{pdb}"]
                compile_opts.extend(extra_compile_args)
            else:
                compile_opts = extra_compile_args
            objects = compiler.compile(
                common_sources,
                output_dir=library_dir,
                debug=cmd.debug,
                extra_postargs=compile_opts,
            )
            compiler.create_static_lib(
                objects, extension_name, output_dir=library_dir, debug=cmd.debug
            )
        if sys.platform == "win32":
            compiler.add_library_dir(library_dir)
            extension.libraries = [extension_name]
        elif sys.platform == "darwin":
            compiler.set_link_objects(
                [
                    "-Wl,-force_load",
                    library_filename,
                ]
            )
        else:
            compiler.set_link_objects(
                [
                    "-Wl,--whole-archive",
                    library_filename,
                    "-Wl,--no-whole-archive",
                ]
            )
    else:
        extension.sources[0:0] = common_sources

    # Compile the source code to object files.
    ext_path = cmd.get_ext_fullpath(extension_name)
    if newer_group(extension.sources, ext_path, "newer"):
        objects = compiler.compile(
            extension.sources,
            output_dir=cmd.build_temp,
            debug=cmd.debug,
            extra_postargs=extra_compile_args,
        )
    else:
        objects = compiler.object_filenames(extension.sources, output_dir=cmd.build_temp)
    # The cmd.get_libraries() call needs a valid compiler attribute or we will
    # get an incorrect library name on the free-threaded Windows build.
    cmd.compiler = compiler
    # Now link the object files together into a "shared object"
    compiler.link_shared_object(
        objects,
        ext_path,
        libraries=cmd.get_libraries(extension),
        extra_postargs=extra_link_args,
        export_symbols=cmd.get_export_symbols(extension),  # type: ignore[no-untyped-call]
        debug=cmd.debug,
        build_temp=cmd.build_temp,
    )

    return pathlib.Path(ext_path)
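
# A minimal usage sketch for compile_c_extension (the path is hypothetical and
# should point at a C parser produced by build_c_generator below; the resulting
# extension lands next to the source, e.g. mydir/parser.cpython-312-darwin.so):
#
#     with tempfile.TemporaryDirectory() as build_dir:
#         so_path = compile_c_extension("mydir/parser.c", build_dir=build_dir)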


def build_parser(
    grammar_file: str, verbose_tokenizer: bool = False, verbose_parser: bool = False
) -> Tuple[Grammar, Parser, Tokenizer]:
    with open(grammar_file) as file:
        tokenizer = Tokenizer(tokenize.generate_tokens(file.readline), verbose=verbose_tokenizer)
        parser = GrammarParser(tokenizer, verbose=verbose_parser)
        grammar = parser.start()

        if not grammar:
            raise parser.make_syntax_error(grammar_file)

    return grammar, parser, tokenizer
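
# A minimal usage sketch for build_parser (the grammar path is hypothetical;
# a SyntaxError is raised if the grammar file cannot be parsed):
#
#     grammar, parser, tokenizer = build_parser("data/python.gram")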


def generate_token_definitions(tokens: IO[str]) -> TokenDefinitions:
    all_tokens = {}
    exact_tokens = {}
    non_exact_tokens = set()
    numbers = itertools.count(0)

    for line in tokens:
        line = line.strip()

        if not line or line.startswith("#"):
            continue

        pieces = line.split()
        index = next(numbers)

        if len(pieces) == 1:
            (token,) = pieces
            non_exact_tokens.add(token)
            all_tokens[index] = token
        elif len(pieces) == 2:
            token, op = pieces
            exact_tokens[op.strip("'")] = index
            all_tokens[index] = token
        else:
            raise ValueError(f"Unexpected line found in Tokens file: {line}")

    return all_tokens, exact_tokens, non_exact_tokens
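
# A minimal sketch of the Tokens-file format this parses (two made-up lines;
# the real input is CPython's Grammar/Tokens file):
#
#     import io
#     src = io.StringIO("ENDMARKER\nLPAR '('\n")
#     all_tokens, exact, non_exact = generate_token_definitions(src)
#     # all_tokens == {0: "ENDMARKER", 1: "LPAR"}
#     # exact == {"(": 1}, non_exact == {"ENDMARKER"}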


def build_c_generator(
    grammar: Grammar,
    grammar_file: str,
    tokens_file: str,
    output_file: str,
    compile_extension: bool = False,
    verbose_c_extension: bool = False,
    keep_asserts_in_extension: bool = True,
    skip_actions: bool = False,
) -> ParserGenerator:
    with open(tokens_file, "r") as tok_file:
        all_tokens, exact_tok, non_exact_tok = generate_token_definitions(tok_file)
    with open(output_file, "w") as file:
        gen: ParserGenerator = CParserGenerator(
            grammar, all_tokens, exact_tok, non_exact_tok, file, skip_actions=skip_actions
        )
        gen.generate(grammar_file)

    if compile_extension:
        with tempfile.TemporaryDirectory() as build_dir:
            compile_c_extension(
                output_file,
                build_dir=build_dir,
                verbose=verbose_c_extension,
                keep_asserts=keep_asserts_in_extension,
            )
    return gen
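
# A minimal usage sketch for build_c_generator (paths are hypothetical;
# `grammar` comes from build_parser above):
#
#     gen = build_c_generator(
#         grammar, "data/python.gram", "Grammar/Tokens", "parser.c",
#         compile_extension=True,
#     )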


def build_python_generator(
    grammar: Grammar,
    grammar_file: str,
    output_file: str,
    skip_actions: bool = False,
) -> ParserGenerator:
    with open(output_file, "w") as file:
        gen: ParserGenerator = PythonParserGenerator(grammar, file)  # TODO: skip_actions
        gen.generate(grammar_file)
    return gen
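
# A minimal usage sketch for build_python_generator (paths are hypothetical):
#
#     gen = build_python_generator(grammar, "data/python.gram", "parser.py")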


def build_c_parser_and_generator(
    grammar_file: str,
    tokens_file: str,
    output_file: str,
    compile_extension: bool = False,
    verbose_tokenizer: bool = False,
    verbose_parser: bool = False,
    verbose_c_extension: bool = False,
    keep_asserts_in_extension: bool = True,
    skip_actions: bool = False,
) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
    """Generate rules, C parser, tokenizer, parser generator for a given grammar

    Args:
        grammar_file (string): Path for the grammar file
        tokens_file (string): Path for the tokens file
        output_file (string): Path for the output file
        compile_extension (bool, optional): Whether to compile the C extension.
          Defaults to False.
        verbose_tokenizer (bool, optional): Whether to display additional output
          when generating the tokenizer. Defaults to False.
        verbose_parser (bool, optional): Whether to display additional output
          when generating the parser. Defaults to False.
        verbose_c_extension (bool, optional): Whether to display additional
          output when compiling the C extension. Defaults to False.
        keep_asserts_in_extension (bool, optional): Whether to keep the assert statements
          when compiling the extension module. Defaults to True.
        skip_actions (bool, optional): Whether to pretend no rule has any actions.
    """
    grammar, parser, tokenizer = build_parser(grammar_file, verbose_tokenizer, verbose_parser)
    gen = build_c_generator(
        grammar,
        grammar_file,
        tokens_file,
        output_file,
        compile_extension,
        verbose_c_extension,
        keep_asserts_in_extension,
        skip_actions=skip_actions,
    )

    return grammar, parser, tokenizer, gen
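
# A minimal end-to-end sketch for the C path (paths are hypothetical):
#
#     grammar, parser, tokenizer, gen = build_c_parser_and_generator(
#         "data/python.gram", "Grammar/Tokens", "parser.c",
#         compile_extension=True, verbose_c_extension=True,
#     )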


def build_python_parser_and_generator(
    grammar_file: str,
    output_file: str,
    verbose_tokenizer: bool = False,
    verbose_parser: bool = False,
    skip_actions: bool = False,
) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
    """Generate rules, Python parser, tokenizer, parser generator for a given grammar

    Args:
        grammar_file (string): Path for the grammar file
        output_file (string): Path for the output file
        verbose_tokenizer (bool, optional): Whether to display additional output
          when generating the tokenizer. Defaults to False.
        verbose_parser (bool, optional): Whether to display additional output
          when generating the parser. Defaults to False.
        skip_actions (bool, optional): Whether to pretend no rule has any actions.
    """
    grammar, parser, tokenizer = build_parser(grammar_file, verbose_tokenizer, verbose_parser)
    gen = build_python_generator(
        grammar,
        grammar_file,
        output_file,
        skip_actions=skip_actions,
    )
    return grammar, parser, tokenizer, gen
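
# A minimal end-to-end sketch for the Python path (paths are hypothetical):
#
#     grammar, parser, tokenizer, gen = build_python_parser_and_generator(
#         "data/python.gram", "parser.py"
#     )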
394