• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Parse a Python module and describe its classes and functions.
2
3Parse enough of a Python file to recognize imports and class and
4function definitions, and to find out the superclasses of a class.
5
6The interface consists of a single function:
7    readmodule_ex(module, path=None)
8where module is the name of a Python module, and path is an optional
9list of directories where the module is to be searched.  If present,
10path is prepended to the system search path sys.path.  The return value
11is a dictionary.  The keys of the dictionary are the names of the
12classes and functions defined in the module (including classes that are
13defined via the from XXX import YYY construct).  The values are
14instances of classes Class and Function.  One special key/value pair is
15present for packages: the key '__path__' has a list as its value which
16contains the package search path.
17
18Classes and Functions have a common superclass: _Object.  Every instance
19has the following attributes:
20    module  -- name of the module;
21    name    -- name of the object;
22    file    -- file in which the object is defined;
23    lineno  -- line in the file where the object's definition starts;
24    parent  -- parent of this object, if any;
25    children -- nested objects contained in this object.
26The 'children' attribute is a dictionary mapping names to objects.
27
28Instances of Function describe functions with the attributes from _Object.
29
30Instances of Class describe classes with the attributes from _Object,
31plus the following:
32    super   -- list of super classes (Class instances if possible);
33    methods -- mapping of method names to beginning line numbers.
34If the name of a super class is not recognized, the corresponding
35entry in the list of super classes is not a class instance but a
36string giving the name of the super class.  Since import statements
37are recognized and imported modules are scanned as well, this
38shouldn't happen often.
39"""
40
41import io
42import sys
43import importlib.util
44import tokenize
45from token import NAME, DEDENT, OP
46
# Public names exported by a star-import of this module.
__all__ = ["readmodule", "readmodule_ex", "Class", "Function"]

# Maps a full (dotted) module name to the dict built for it by _readmodule,
# so each module is scanned at most once per process.
_modules = {}  # Initialize cache of modules we've seen.
50
51
class _Object:
    """Common record describing one Python class or function.

    Attributes stored on every instance:
        module   -- name of the module;
        name     -- name of the object;
        file     -- file in which the object is defined;
        lineno   -- line where the definition starts;
        parent   -- enclosing object, as passed by the caller;
        children -- dict mapping names to nested objects (starts empty).
    """

    def __init__(self, module, name, file, lineno, parent):
        self.module, self.name = module, name
        self.file, self.lineno = file, lineno
        self.parent = parent
        # One fresh dict per instance; filled in through _addchild().
        self.children = {}

    def _addchild(self, name, obj):
        """Register obj under name among this object's children."""
        self.children[name] = obj
64
65
class Function(_Object):
    """Description of a Python function (methods included).

    Carries exactly the attributes provided by _Object; parent defaults
    to None.
    """

    def __init__(self, module, name, file, lineno, parent=None):
        super().__init__(module, name, file, lineno, parent)
70
71
class Class(_Object):
    """Description of a Python class.

    In addition to the _Object attributes an instance has:
        super   -- the list of superclasses given by the caller, or a new
                   empty list when None was passed;
        methods -- dict mapping method names to line numbers (starts empty).
    """

    def __init__(self, module, name, super, file, lineno, parent=None):
        # The parameter called 'super' hides the builtin inside this
        # method, so the base initializer is invoked explicitly.
        _Object.__init__(self, module, name, file, lineno, parent)
        if super is None:
            super = []
        self.super = super
        self.methods = {}

    def _addmethod(self, name, lineno):
        """Record lineno as the line number of the method called name."""
        self.methods[name] = lineno
81
82
def _nest_function(ob, func_name, lineno):
    """Create a Function called func_name nested inside ob and return it.

    The new Function takes its module and file from ob, has ob as its
    parent, and is added to ob's children.  When ob is a Class the
    function is additionally recorded as one of its methods.
    """
    nested = Function(ob.module, func_name, ob.file, lineno, parent=ob)
    ob._addchild(func_name, nested)
    if not isinstance(ob, Class):
        return nested
    ob._addmethod(func_name, lineno)
    return nested
90
def _nest_class(ob, class_name, lineno, super=None):
    """Create a Class called class_name nested inside ob and return it.

    The new Class takes its module and file from ob, has ob as its
    parent, and is added to ob's children.  super is handed to Class
    unchanged.
    """
    nested = Class(ob.module, class_name, super, ob.file, lineno, parent=ob)
    ob._addchild(class_name, nested)
    return nested
96
def readmodule(module, path=None):
    """Return a dict of the Class objects found at the top level of module.

    This is the original interface, before Functions were added: entries
    of the full result that are not Class instances are left out.
    """
    everything = _readmodule(module, path or [])
    return {
        name: item
        for name, item in everything.items()
        if isinstance(item, Class)
    }
108
def readmodule_ex(module, path=None):
    """Return a dictionary with all functions and classes in module.

    The module is looked up in PATH + sys.path.  Imported superclasses
    are included where possible.  The work is done by reading source,
    without importing (and executing) it.
    """
    search_dirs = path if path else []
    return _readmodule(module, search_dirs)
117
def _readmodule(module, path, inpackage=None):
    """Do the hard work for readmodule[_ex].

    If inpackage is given, it must be the dotted name of the package in
    which we are searching for a submodule, and then PATH must be the
    package search path; otherwise, we are searching for a top-level
    module, and path is combined with sys.path.

    Returns the dict describing the module.  The dict is stored in the
    global _modules cache under the full module name and is returned
    directly on later calls.  It is empty for built-in modules and holds
    at most '__path__' when no source text is available.

    Raises ImportError when a dotted name goes through something that is
    not a package, and ModuleNotFoundError when no spec can be found.
    """
    # Compute the full module name (prepending inpackage if set).
    if inpackage is not None:
        fullmodule = "%s.%s" % (inpackage, module)
    else:
        fullmodule = module

    # Check in the cache.
    if fullmodule in _modules:
        return _modules[fullmodule]

    # Initialize the dict for this module's contents.
    tree = {}

    # Check if it is a built-in module; we don't do much for these.
    if module in sys.builtin_module_names and inpackage is None:
        _modules[module] = tree
        return tree

    # Check for a dotted module name: read the parent package first, then
    # look for the last component on the parent's package path.
    if '.' in module:
        package, _, submodule = module.rpartition('.')
        parent = _readmodule(package, path, inpackage)
        if inpackage is not None:
            package = "%s.%s" % (inpackage, package)
        if '__path__' not in parent:
            raise ImportError('No package named {}'.format(package))
        return _readmodule(submodule, parent['__path__'], package)

    # Search the path for the module.
    if inpackage is not None:
        search_path = path
    else:
        search_path = path + sys.path
    spec = importlib.util._find_spec_from_path(fullmodule, search_path)
    if spec is None:
        raise ModuleNotFoundError(f"no module named {fullmodule!r}", name=fullmodule)
    # Cache the (still empty) tree before the source is scanned, so that a
    # recursive request for this module finds the cached dict.
    _modules[fullmodule] = tree
    # Is module a package?
    if spec.submodule_search_locations is not None:
        tree['__path__'] = spec.submodule_search_locations
    try:
        source = spec.loader.get_source(fullmodule)
    except (AttributeError, ImportError):
        # If module is not Python source, we cannot do anything.
        return tree
    else:
        if source is None:
            return tree

    fname = spec.loader.get_filename(fullmodule)
    return _create_tree(fullmodule, path, fname, source, tree, inpackage)
180
181
def _create_tree(fullmodule, path, fname, source, tree, inpackage):
    """Return the tree for a particular module.

    fullmodule (full module name), inpackage+module, becomes o.module.
    path is passed to recursive calls of _readmodule.
    fname becomes o.file.
    source is tokenized.  Imports cause recursive calls to _readmodule.
    tree is {} or {'__path__': <submodule search locations>}.
    inpackage, None or string, is passed to recursive calls of _readmodule.

    The effect of recursive calls is mutation of global _modules.

    Only 'import' and 'from' statements that start in column 0 are
    followed.  An imported module that cannot be found or parsed is
    skipped; KeyboardInterrupt and SystemExit are not suppressed.
    """
    f = io.StringIO(source)

    stack = []  # Stack of (Class or Function, indent) pairs.

    g = tokenize.generate_tokens(f.readline)
    try:
        for tokentype, token, start, _end, _line in g:
            if tokentype == DEDENT:
                thisindent = start[1]
                # Close previous nested classes and defs.
                while stack and stack[-1][1] >= thisindent:
                    del stack[-1]
            elif token == 'def':
                lineno, thisindent = start
                # Close previous nested classes and defs.
                while stack and stack[-1][1] >= thisindent:
                    del stack[-1]
                tokentype, func_name, start = next(g)[0:3]
                if tokentype != NAME:
                    continue  # Skip def with syntax error.
                cur_func = None
                if stack:
                    cur_obj = stack[-1][0]
                    cur_func = _nest_function(cur_obj, func_name, lineno)
                else:
                    # It is just a function.
                    cur_func = Function(fullmodule, func_name, fname, lineno)
                    tree[func_name] = cur_func
                stack.append((cur_func, thisindent))
            elif token == 'class':
                lineno, thisindent = start
                # Close previous nested classes and defs.
                while stack and stack[-1][1] >= thisindent:
                    del stack[-1]
                tokentype, class_name, start = next(g)[0:3]
                if tokentype != NAME:
                    continue  # Skip class with syntax error.
                # Parse what follows the class name.
                tokentype, token, start = next(g)[0:3]
                inherit = None
                if token == '(':
                    names = []  # Initialize list of superclasses.
                    level = 1
                    base_tokens = []  # Tokens making up current superclass.
                    while True:
                        tokentype, token, start = next(g)[0:3]
                        if token in (')', ',') and level == 1:
                            n = "".join(base_tokens)
                            if n in tree:
                                # We know this super class.
                                n = tree[n]
                            else:
                                c = n.split('.')
                                if len(c) > 1:
                                    # Super class form is module.class:
                                    # look in module for class.
                                    m = c[-2]
                                    c = c[-1]
                                    if m in _modules:
                                        d = _modules[m]
                                        if c in d:
                                            n = d[c]
                            names.append(n)
                            base_tokens = []
                        if token == '(':
                            level += 1
                        elif token == ')':
                            level -= 1
                            if level == 0:
                                break
                        elif token == ',' and level == 1:
                            pass
                        # Only use NAME and OP (== dot) tokens for type name.
                        elif tokentype in (NAME, OP) and level == 1:
                            base_tokens.append(token)
                        # Expressions in the base list are not supported.
                    inherit = names
                if stack:
                    cur_obj = stack[-1][0]
                    cur_class = _nest_class(
                            cur_obj, class_name, lineno, inherit)
                else:
                    cur_class = Class(fullmodule, class_name, inherit,
                                      fname, lineno)
                    tree[class_name] = cur_class
                stack.append((cur_class, thisindent))
            elif token == 'import' and start[1] == 0:
                modules = _getnamelist(g)
                for mod, _mod2 in modules:
                    try:
                        # Recursively read the imported module.
                        if inpackage is None:
                            _readmodule(mod, path)
                        else:
                            try:
                                _readmodule(mod, path, inpackage)
                            except ImportError:
                                _readmodule(mod, [])
                    except Exception:
                        # If we can't find or parse the imported module,
                        # too bad -- don't die here.
                        pass
            elif token == 'from' and start[1] == 0:
                mod, token = _getname(g)
                if not mod or token != "import":
                    continue
                names = _getnamelist(g)
                try:
                    # Recursively read the imported module.
                    d = _readmodule(mod, path, inpackage)
                except Exception:
                    # If we can't find or parse the imported module,
                    # too bad -- don't die here.
                    continue
                # Add any classes that were defined in the imported module
                # to our name space if they were mentioned in the list.
                for n, n2 in names:
                    if n in d:
                        tree[n2 or n] = d[n]
                    elif n == '*':
                        # Don't add names that start with _.
                        for key in d:
                            if not key.startswith('_'):
                                tree[key] = d[key]
    except StopIteration:
        # The token stream ended in the middle of a statement; keep what
        # has been collected so far.
        pass
    finally:
        f.close()

    return tree
323
324
def _getnamelist(g):
    """Return list of (dotted-name, as-name or None) tuples for token source g.

    An as-name is the name that follows 'as' in an as clause.  Reading
    stops at the first item that has no name or is not followed by a comma.
    """
    pairs = []
    while True:
        dotted, tok = _getname(g)
        if not dotted:
            break
        alias = None
        if tok == 'as':
            alias, tok = _getname(g)
        pairs.append((dotted, alias))
        # Skip ahead to the separating comma or to a token containing a
        # newline, whichever comes first.
        while not (tok == "," or "\n" in tok):
            tok = next(g)[1]
        if tok != ",":
            break
    return pairs
345
346
def _getname(g):
    """Return (dotted-name or None, next-token) tuple for token source g.

    The name is None when the first token read is neither a NAME nor '*'.
    The second item is the text of the last token consumed from g.
    """
    kind, text = next(g)[:2]
    if kind != NAME and text != '*':
        return (None, text)
    pieces = [text]
    while True:
        text = next(g)[1]
        if text != '.':
            break
        # A dot must be followed by another NAME to extend the dotted name.
        kind, text = next(g)[:2]
        if kind != NAME:
            break
        pieces.append(text)
    return (".".join(pieces), text)
363
364
def _main():
    """Print module output (default this file) for quick visual check.

    The module is taken from sys.argv[1] when present.  If that argument
    names an existing path, its directory is used as the search path and
    its base name (without a trailing '.py') as the module name.  Each
    class and function is printed on its own line, nested objects indented
    below their parent and siblings in line-number order.
    """
    import os
    try:
        mod = sys.argv[1]
    except IndexError:
        # No argument given: describe this file.
        mod = __file__
    if os.path.exists(mod):
        path = [os.path.dirname(mod)]
        mod = os.path.basename(mod)
        if mod.lower().endswith(".py"):
            mod = mod[:-3]
    else:
        path = []
    tree = readmodule_ex(mod, path)

    def lineno_key(a):
        return getattr(a, 'lineno', 0)

    # objs is used as a stack, so it is sorted in reverse to pop the
    # lowest line number first.
    objs = sorted(tree.values(), key=lineno_key, reverse=True)
    indent_level = 2
    while objs:
        obj = objs.pop()
        if not isinstance(obj, _Object):
            # Not a class or function, e.g. the value of a '__path__' key.
            continue
        if not hasattr(obj, 'indent'):
            obj.indent = 0

        # Push the children so that they are printed right after obj.
        new_objs = sorted(obj.children.values(),
                          key=lineno_key, reverse=True)
        for ob in new_objs:
            ob.indent = obj.indent + indent_level
        objs.extend(new_objs)
        if isinstance(obj, Class):
            print("{}class {} {} {}"
                  .format(' ' * obj.indent, obj.name, obj.super, obj.lineno))
        elif isinstance(obj, Function):
            print("{}def {} {}".format(' ' * obj.indent, obj.name, obj.lineno))
402
# Run the quick visual check when this file is executed as a script.
if __name__ == "__main__":
    _main()
405