# [source-browser navigation residue, not part of the module: Home | Line# | Scopes# | Navigate# | Raw | Download]
"""Parse a Python module and describe its classes and functions.

Parse enough of a Python file to recognize imports and class and
function definitions, and to find out the superclasses of a class.

The interface consists of a single function:
    readmodule_ex(module, path=None)
where module is the name of a Python module, and path is an optional
list of directories where the module is to be searched.  If present,
path is prepended to the system search path sys.path.  The return value
is a dictionary.  The keys of the dictionary are the names of the
classes and functions defined in the module (including classes that are
defined via the from XXX import YYY construct).  The values are
instances of classes Class and Function.  One special key/value pair is
present for packages: the key '__path__' has a list as its value which
contains the package search path.

Classes and Functions have a common superclass: _Object.  Every instance
has the following attributes:
    module  -- name of the module;
    name    -- name of the object;
    file    -- file in which the object is defined;
    lineno  -- line in the file where the object's definition starts;
    parent  -- parent of this object, if any;
    children -- nested objects contained in this object.
The 'children' attribute is a dictionary mapping names to objects.

Instances of Function describe functions with the attributes from _Object.

Instances of Class describe classes with the attributes from _Object,
plus the following:
    super   -- list of super classes (Class instances if possible);
    methods -- mapping of method names to beginning line numbers.
If the name of a super class is not recognized, the corresponding
entry in the list of super classes is not a class instance but a
string giving the name of the super class.  Since import statements
are recognized and imported modules are scanned as well, this
shouldn't happen often.
"""
40
import io
import sys
import importlib.util
import tokenize
from token import NAME, DEDENT, OP

# Public names exported by "from <this module> import *".
__all__ = ["readmodule", "readmodule_ex", "Class", "Function"]

# Cache of modules already read, keyed by full dotted module name.
_modules = {}
50
51
class _Object:
    """Information about a Python class or function.

    Attributes set by the constructor:
        module   -- value passed as module;
        name     -- value passed as name;
        file     -- value passed as file;
        lineno   -- value passed as lineno;
        parent   -- value passed as parent;
        children -- dict, initially empty, filled through _addchild().
    """
    def __init__(self, module, name, file, lineno, parent):
        self.module = module
        self.name = name
        self.file = file
        self.lineno = lineno
        self.parent = parent
        self.children = {}

    def _addchild(self, name, obj):
        "Register obj under name in the children mapping."
        self.children[name] = obj
64
65
class Function(_Object):
    "Information about a Python function, including methods."
    def __init__(self, module, name, file, lineno, parent=None):
        # Same attributes as _Object; the only difference is that parent
        # is optional here.
        _Object.__init__(self, module, name, file, lineno, parent)
70
71
class Class(_Object):
    """Information about a Python class.

    In addition to the _Object attributes:
        super   -- the list passed as super, or a new empty list if None;
        methods -- dict mapping method names to line numbers, filled
                   through _addmethod().
    """
    def __init__(self, module, name, super, file, lineno, parent=None):
        # The 'super' parameter shadows the builtin inside this method, so
        # the base initializer has to be called through the class name.
        _Object.__init__(self, module, name, file, lineno, parent)
        self.super = [] if super is None else super
        self.methods = {}

    def _addmethod(self, name, lineno):
        "Record that method name starts at line lineno."
        self.methods[name] = lineno
81
82
def _nest_function(ob, func_name, lineno):
    """Return a Function after nesting within ob.

    The new Function takes its module and file from ob, has ob as its
    parent, and is added to ob's children.  When ob is a Class, the
    function is also recorded in ob.methods.
    """
    newfunc = Function(ob.module, func_name, ob.file, lineno, ob)
    ob._addchild(func_name, newfunc)
    if isinstance(ob, Class):
        ob._addmethod(func_name, lineno)
    return newfunc
90
def _nest_class(ob, class_name, lineno, super=None):
    """Return a Class after nesting within ob.

    The new Class takes its module and file from ob, has ob as its
    parent, and is added to ob's children.  super is passed through to
    the Class constructor unchanged.
    """
    newclass = Class(ob.module, class_name, super, ob.file, lineno, ob)
    ob._addchild(class_name, newclass)
    return newclass
96
def readmodule(module, path=None):
    """Return Class objects for the top-level classes in module.

    This is the original interface, before Functions were added.
    The result is a new dict holding only those entries of the module's
    tree whose value is a Class instance.
    """
    return {name: obj
            for name, obj in _readmodule(module, path or []).items()
            if isinstance(obj, Class)}
108
def readmodule_ex(module, path=None):
    """Return a dictionary with all functions and classes in module.

    Search for module in PATH + sys.path.
    If possible, include imported superclasses.
    Do this by reading source, without importing (and executing) it.
    """
    # A missing (or empty) path means "search sys.path only".
    return _readmodule(module, path or [])
117
def _readmodule(module, path, inpackage=None):
    """Do the hard work for readmodule[_ex].

    If inpackage is given, it must be the dotted name of the package in
    which we are searching for a submodule, and then PATH must be the
    package search path; otherwise, we are searching for a top-level
    module, and path is combined with sys.path.

    Return the (possibly cached) dict describing the module.  Raise
    ImportError if the module, or a package on the way to it, cannot be
    found.
    """
    # Compute the full module name (prepending inpackage if set).
    if inpackage is not None:
        fullmodule = "%s.%s" % (inpackage, module)
    else:
        fullmodule = module

    # Check in the cache.
    if fullmodule in _modules:
        return _modules[fullmodule]

    # Initialize the dict for this module's contents.
    tree = {}

    # Check if it is a built-in module; we don't do much for these.
    if module in sys.builtin_module_names and inpackage is None:
        _modules[module] = tree
        return tree

    # Check for a dotted module name.
    i = module.rfind('.')
    if i >= 0:
        package = module[:i]
        submodule = module[i+1:]
        parent = _readmodule(package, path, inpackage)
        if inpackage is not None:
            package = "%s.%s" % (inpackage, package)
        if '__path__' not in parent:
            raise ImportError('No package named {}'.format(package))
        return _readmodule(submodule, parent['__path__'], package)

    # Search the path for the module.
    if inpackage is not None:
        search_path = path
    else:
        search_path = path + sys.path
    spec = importlib.util._find_spec_from_path(fullmodule, search_path)
    if spec is None:
        # Report a missing module as ImportError, which callers already
        # catch, and do it before touching the cache so that a failed
        # lookup does not leave an empty tree behind.
        raise ImportError('No module named {!r}'.format(fullmodule),
                          name=fullmodule)
    # Cache the tree before parsing so that circular imports met while
    # reading the source find this entry instead of recursing forever.
    _modules[fullmodule] = tree
    # Is module a package?
    if spec.submodule_search_locations is not None:
        tree['__path__'] = spec.submodule_search_locations
    try:
        source = spec.loader.get_source(fullmodule)
        if source is None:
            return tree
    except (AttributeError, ImportError):
        # If module is not Python source, we cannot do anything.
        return tree

    fname = spec.loader.get_filename(fullmodule)
    return _create_tree(fullmodule, path, fname, source, tree, inpackage)
177
178
def _create_tree(fullmodule, path, fname, source, tree, inpackage):
    """Return the tree for a particular module.

    fullmodule (full module name), inpackage+module, becomes o.module.
    path is passed to recursive calls of _readmodule.
    fname becomes o.file.
    source is tokenized.  Imports cause recursive calls to _readmodule.
    tree is {} or {'__path__': <submodule search locations>}.
    inpackage, None or string, is passed to recursive calls of _readmodule.

    The effect of recursive calls is mutation of global _modules.
    The tree argument is filled in place and is also the return value.
    """
    f = io.StringIO(source)

    stack = []  # Initialize stack of (class, indent) pairs.

    g = tokenize.generate_tokens(f.readline)
    try:
        for tokentype, token, start, _end, _line in g:
            if tokentype == DEDENT:
                lineno, thisindent = start
                # Close previous nested classes and defs.
                while stack and stack[-1][1] >= thisindent:
                    del stack[-1]
            elif token == 'def':
                lineno, thisindent = start
                # Close previous nested classes and defs.
                while stack and stack[-1][1] >= thisindent:
                    del stack[-1]
                tokentype, func_name, start = next(g)[0:3]
                if tokentype != NAME:
                    continue  # Skip def with syntax error.
                cur_func = None
                if stack:
                    cur_obj = stack[-1][0]
                    cur_func = _nest_function(cur_obj, func_name, lineno)
                else:
                    # It is just a function.
                    cur_func = Function(fullmodule, func_name, fname, lineno)
                    tree[func_name] = cur_func
                stack.append((cur_func, thisindent))
            elif token == 'class':
                lineno, thisindent = start
                # Close previous nested classes and defs.
                while stack and stack[-1][1] >= thisindent:
                    del stack[-1]
                tokentype, class_name, start = next(g)[0:3]
                if tokentype != NAME:
                    continue  # Skip class with syntax error.
                # Parse what follows the class name.
                tokentype, token, start = next(g)[0:3]
                inherit = None
                if token == '(':
                    names = []  # Initialize list of superclasses.
                    level = 1
                    base_tokens = []  # Tokens making up current superclass.
                    while True:
                        tokentype, token, start = next(g)[0:3]
                        if token in (')', ',') and level == 1:
                            n = "".join(base_tokens)
                            if n in tree:
                                # We know this super class.
                                n = tree[n]
                            else:
                                c = n.split('.')
                                if len(c) > 1:
                                    # Super class form is module.class:
                                    # look in module for class.
                                    m = c[-2]
                                    c = c[-1]
                                    if m in _modules:
                                        d = _modules[m]
                                        if c in d:
                                            n = d[c]
                            names.append(n)
                            base_tokens = []
                        if token == '(':
                            level += 1
                        elif token == ')':
                            level -= 1
                            if level == 0:
                                break
                        elif token == ',' and level == 1:
                            pass
                        # Only use NAME and OP (== dot) tokens for type name.
                        elif tokentype in (NAME, OP) and level == 1:
                            base_tokens.append(token)
                        # Expressions in the base list are not supported.
                    inherit = names
                if stack:
                    cur_obj = stack[-1][0]
                    cur_class = _nest_class(
                            cur_obj, class_name, lineno, inherit)
                else:
                    cur_class = Class(fullmodule, class_name, inherit,
                                      fname, lineno)
                    tree[class_name] = cur_class
                stack.append((cur_class, thisindent))
            elif token == 'import' and start[1] == 0:
                modules = _getnamelist(g)
                for mod, _mod2 in modules:
                    try:
                        # Recursively read the imported module.
                        if inpackage is None:
                            _readmodule(mod, path)
                        else:
                            try:
                                _readmodule(mod, path, inpackage)
                            except ImportError:
                                _readmodule(mod, [])
                    except Exception:
                        # If we can't find or parse the imported module,
                        # too bad -- don't die here.  KeyboardInterrupt
                        # and SystemExit are deliberately not swallowed.
                        pass
            elif token == 'from' and start[1] == 0:
                mod, token = _getname(g)
                if not mod or token != "import":
                    continue
                names = _getnamelist(g)
                try:
                    # Recursively read the imported module.
                    d = _readmodule(mod, path, inpackage)
                except Exception:
                    # If we can't find or parse the imported module,
                    # too bad -- don't die here.  KeyboardInterrupt
                    # and SystemExit are deliberately not swallowed.
                    continue
                # Add any classes that were defined in the imported module
                # to our name space if they were mentioned in the list.
                for n, n2 in names:
                    if n in d:
                        tree[n2 or n] = d[n]
                    elif n == '*':
                        # Don't add names that start with _.
                        for public in d:
                            if not public.startswith('_'):
                                tree[public] = d[public]
    except StopIteration:
        # A next(g) call above ran off the end of the token stream;
        # whatever was collected so far is the result.
        pass
    finally:
        # Release the buffer even if tokenizing raised.
        f.close()

    return tree
320
321
def _getnamelist(g):
    """Return list of (dotted-name, as-name or None) tuples for token source g.

    An as-name is the name that follows 'as' in an as clause.

    If g is exhausted part-way through, the names collected up to that
    point are returned instead of letting StopIteration discard them.
    """
    names = []
    try:
        while True:
            name, token = _getname(g)
            if not name:
                break
            if token == 'as':
                name2, token = _getname(g)
            else:
                name2 = None
            names.append((name, name2))
            # Skip ahead to the separating comma or to a token that
            # contains a newline, whichever comes first.
            while token != "," and "\n" not in token:
                token = next(g)[1]
            if token != ",":
                break
    except StopIteration:
        # The token source ended before a comma or a newline-bearing
        # token was seen; keep what has been gathered so far.
        pass
    return names
342
343
def _getname(g):
    """Return (dotted-name or None, next-token) tuple for token source g.

    The first token must be a NAME or '*'; otherwise the result is
    (None, that-token).  Further '.NAME' pairs are appended to the dotted
    name.  The second item is the string of the token at which scanning
    stopped.  StopIteration from g is not caught here.
    """
    parts = []
    tokentype, token = next(g)[0:2]
    if tokentype != NAME and token != '*':
        return (None, token)
    parts.append(token)
    while True:
        token = next(g)[1]
        if token != '.':
            break
        tokentype, token = next(g)[0:2]
        if tokentype != NAME:
            break
        parts.append(token)
    return (".".join(parts), token)
360
361
def _main():
    """Print module output (default this file) for quick visual check.

    The module is taken from sys.argv[1].  If that names an existing
    file, its directory becomes the search path and its base name (minus
    a trailing ".py") the module name.
    """
    import os
    try:
        mod = sys.argv[1]
    except IndexError:
        # No argument given: describe this file.
        mod = __file__
    if os.path.exists(mod):
        path = [os.path.dirname(mod)]
        mod = os.path.basename(mod)
        if mod.lower().endswith(".py"):
            mod = mod[:-3]
    else:
        path = []
    tree = readmodule_ex(mod, path)

    def lineno_key(a):
        # Values without a lineno (the '__path__' entry) sort as line 0.
        return getattr(a, 'lineno', 0)

    # Sorted in reverse so that pop() yields objects in source order.
    objs = sorted(tree.values(), key=lineno_key, reverse=True)
    indent_level = 2
    while objs:
        obj = objs.pop()
        if not isinstance(obj, _Object):
            # Value of the '__path__' key, not a class or function.
            continue
        if not hasattr(obj, 'indent'):
            obj.indent = 0

        # Queue the children so they are printed right after their
        # parent, one indentation step deeper.
        new_objs = sorted(obj.children.values(),
                          key=lineno_key, reverse=True)
        for ob in new_objs:
            ob.indent = obj.indent + indent_level
        objs.extend(new_objs)
        if isinstance(obj, Class):
            print("{}class {} {} {}"
                  .format(' ' * obj.indent, obj.name, obj.super, obj.lineno))
        elif isinstance(obj, Function):
            print("{}def {} {}".format(' ' * obj.indent, obj.name, obj.lineno))

if __name__ == "__main__":
    _main()
402