• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""A collection of modules for iterating through different kinds of
2tree, generating tokens identical to those produced by the tokenizer
3module.
4
To create a tree walker for a new type of tree, you need to
6implement a tree walker object (called TreeWalker by convention) that
7implements a 'serialize' method taking a tree as sole argument and
8returning an iterator generating tokens.
9"""
10
11from __future__ import absolute_import, division, unicode_literals
12
# Public API advertised by this package: the two functions defined in this
# module plus the names of the walker submodules that getTreeWalker imports
# on demand.
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree",
           "pulldom"]
15
16import sys
17
18from .. import constants
19from ..utils import default_etree
20
# Maps a lower-cased tree type name to its TreeWalker class. Filled lazily by
# getTreeWalker; the "etree" type is deliberately never stored here.
treeWalkerCache = {}
22
23
def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are:

                "dom" - The xml.dom.minidom DOM implementation
                "pulldom" - The xml.dom.pulldom event stream
                "etree" - A generic walker for tree implementations exposing an
                          elementtree-like interface (known to work with
                          ElementTree, cElementTree and lxml.etree).
                "lxml" - Optimized walker for lxml.etree
                "genshi" - a Genshi stream

    implementation - (Currently applies to the "etree" tree type only). A module
                      implementing the tree type e.g. xml.etree.ElementTree or
                      cElementTree.

    Any extra keyword arguments are forwarded to etree.getETreeModule when
    treeType is "etree" and are ignored otherwise.

    Returns the TreeWalker class, or None when treeType is not recognised."""

    treeType = treeType.lower()

    # Fast path: the class was resolved by an earlier call.
    if treeType in treeWalkerCache:
        return treeWalkerCache[treeType]

    if treeType == "etree":
        from . import etree
        if implementation is None:
            implementation = default_etree
        # XXX: NEVER cache here, caching is done in the etree submodule
        return etree.getETreeModule(implementation, **kwargs).TreeWalker

    if treeType in ("dom", "pulldom"):
        # These submodules are named after the tree type, so import by name.
        moduleName = "%s.%s" % (__name__, treeType)
        __import__(moduleName)
        walkerClass = sys.modules[moduleName].TreeWalker
    elif treeType == "genshi":
        from . import genshistream
        walkerClass = genshistream.TreeWalker
    elif treeType == "lxml":
        from . import lxmletree
        walkerClass = lxmletree.TreeWalker
    else:
        # Unknown tree type: nothing to cache, nothing to return.
        return None

    treeWalkerCache[treeType] = walkerClass
    return walkerClass
62
63
def concatenateCharacterTokens(tokens):
    """Merge runs of adjacent character tokens into single tokens.

    tokens - an iterable of token dictionaries, each with a "type" key;
             "Characters" and "SpaceCharacters" tokens also need a "data" key.

    Yields every non-character token unchanged. Each consecutive run of
    "Characters" / "SpaceCharacters" tokens is replaced by one new
    "Characters" token whose "data" is the concatenation of the run.
    """
    buffered = []
    for tok in tokens:
        if tok["type"] in ("Characters", "SpaceCharacters"):
            buffered.append(tok["data"])
            continue
        # A non-character token ends the current run: flush it first.
        if buffered:
            yield {"type": "Characters", "data": "".join(buffered)}
            buffered = []
        yield tok
    # Flush a run that reaches the end of the stream.
    if buffered:
        yield {"type": "Characters", "data": "".join(buffered)}
77
78
def _qualifiedName(namespace, localname):
    # Label a namespaced name as "<prefix> <localname>", falling back to the
    # raw namespace when constants.prefixes has no entry for it.
    if namespace in constants.prefixes:
        namespace = constants.prefixes[namespace]
    return "%s %s" % (namespace, localname)


def _attributeSortKey(item):
    # Sort key for ((namespace, localname), value) attribute items. A None
    # namespace cannot be compared with a string on Python 3, so it is mapped
    # to a flag that places un-namespaced attributes first.
    (namespace, localname), _ = item
    return (namespace is not None, namespace or "", localname)


def pprint(walker):
    """Pretty printer for tree walkers

    walker - an iterable of token dictionaries. Every token has a "type" key;
             "StartTag"/"EmptyTag" tokens also carry "namespace", "name" and
             "data" (a mapping from (namespace, localname) to the attribute
             value), "Doctype" tokens carry "name", "publicId" and
             "systemId", and "Comment"/"Characters" tokens carry "data".

    Returns a single string with one line per tag, attribute, comment,
    doctype or text run, indented by two spaces per open element.

    Raises ValueError if a token has an unknown "type".
    """
    output = []
    indent = 0
    for token in concatenateCharacterTokens(walker):
        tokenType = token["type"]
        if tokenType in ("StartTag", "EmptyTag"):
            # tag name (only non-HTML namespaces are shown)
            namespace = token["namespace"]
            if namespace and namespace != constants.namespaces["html"]:
                name = _qualifiedName(namespace, token["name"])
            else:
                name = token["name"]
            output.append("%s<%s>" % (" " * indent, name))
            indent += 2
            # attributes (sorted for consistent ordering); an explicit key is
            # needed because namespaces may mix None and strings
            attrs = token["data"]
            for (namespace, localname), value in sorted(attrs.items(),
                                                        key=_attributeSortKey):
                if namespace:
                    name = _qualifiedName(namespace, localname)
                else:
                    name = localname
                output.append("%s%s=\"%s\"" % (" " * indent, name, value))
            # self-closing: there will be no EndTag to undo the indent
            if tokenType == "EmptyTag":
                indent -= 2

        elif tokenType == "EndTag":
            indent -= 2

        elif tokenType == "Comment":
            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))

        elif tokenType == "Doctype":
            if token["name"]:
                if token["publicId"]:
                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["publicId"],
                                   token["systemId"] or ""))
                elif token["systemId"]:
                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["systemId"]))
                else:
                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
                                                       token["name"]))
            else:
                output.append("%s<!DOCTYPE >" % (" " * indent,))

        elif tokenType == "Characters":
            output.append("%s\"%s\"" % (" " * indent, token["data"]))

        elif tokenType == "SpaceCharacters":
            # Invariant check: the concatenation step folds these into
            # "Characters" tokens, so none should reach this loop.
            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"

        else:
            raise ValueError("Unknown token type, %s" % tokenType)

    return "\n".join(output)
148