• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1from __future__ import absolute_import, division, unicode_literals
2from six import text_type, string_types
3
4__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
5           "TreeWalker", "NonRecursiveTreeWalker"]
6
7from xml.dom import Node
8
# Node-kind identifiers. NonRecursiveTreeWalker.__iter__ compares the first
# element returned by getNodeDetails() against these values to decide which
# token(s) to emit. All but UNKNOWN reuse the xml.dom.Node constants.
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
# Marker string rather than an xml.dom constant. It is not matched explicitly
# in NonRecursiveTreeWalker.__iter__, so a node reported with this kind ends
# up in the final "unknown node" branch.
UNKNOWN = "<#UNKNOWN#>"
16
17from ..constants import voidElements, spaceCharacters
# Rebind the imported collection as one string: TreeWalker.text() passes it
# to lstrip()/rstrip(), which take the characters to strip as a string.
spaceCharacters = "".join(spaceCharacters)
19
20
def to_text(s, blank_if_none=True):
    """Coerce ``s`` to ``six.text_type``, with special handling for ``None``.

    ``None`` becomes the empty string when ``blank_if_none`` is true and is
    returned unchanged otherwise.  A value that is already text is returned
    as-is (same object); anything else is passed through ``text_type()``.
    """
    if s is None:
        return "" if blank_if_none else None
    if isinstance(s, text_type):
        # Already text: hand back the very same object, no copy.
        return s
    return text_type(s)
32
33
def is_text_or_none(string):
    """Return True when ``string`` is None or an instance of ``six.string_types``."""
    if string is None:
        return True
    return isinstance(string, string_types)
37
38
class TreeWalker(object):
    """Base class for walkers that turn a tree into a stream of token dicts.

    Subclasses provide ``__iter__``; the remaining methods are helpers that
    build one token (or, for ``emptyTag`` and ``text``, generate several) in
    the dict format used throughout this module.  Every token has a
    ``"type"`` key naming its kind.
    """

    def __init__(self, tree):
        """Remember ``tree``, the root object that iteration starts from."""
        self.tree = tree

    def __iter__(self):
        """Yield the tokens for ``self.tree``; must be overridden."""
        raise NotImplementedError

    def error(self, msg):
        """Return a ``SerializeError`` token carrying ``msg`` as its data."""
        return {"type": "SerializeError", "data": msg}

    def emptyTag(self, namespace, name, attrs, hasChildren=False):
        """Generate the token(s) for a void element.

        Yields one ``EmptyTag`` token and, when ``hasChildren`` is true, a
        ``SerializeError`` token after it.

        :arg namespace: element namespace, text or None (emitted as "" when None)
        :arg name: element name, text
        :arg attrs: mapping of ``(namespace, name)`` tuples to text values;
            placed in the token unchanged
        :arg hasChildren: whether the element was found to have children
        """
        assert namespace is None or isinstance(namespace, string_types), type(namespace)
        assert isinstance(name, string_types), type(name)
        assert all((namespace is None or isinstance(namespace, string_types)) and
                   isinstance(name, string_types) and
                   isinstance(value, string_types)
                   for (namespace, name), value in attrs.items())

        yield {"type": "EmptyTag", "name": to_text(name, False),
               "namespace": to_text(namespace),
               "data": attrs}
        if hasChildren:
            yield self.error("Void element has children")

    def startTag(self, namespace, name, attrs):
        """Return a ``StartTag`` token.

        :arg namespace: element namespace, text or None (emitted as "" when None)
        :arg name: element name, text
        :arg attrs: mapping of ``(namespace, name)`` tuples to text values;
            copied into a new dict with keys and values coerced to text
            (a None attribute namespace is kept as None)
        """
        assert namespace is None or isinstance(namespace, string_types), type(namespace)
        assert isinstance(name, string_types), type(name)
        assert all((namespace is None or isinstance(namespace, string_types)) and
                   isinstance(name, string_types) and
                   isinstance(value, string_types)
                   for (namespace, name), value in attrs.items())

        return {"type": "StartTag",
                "name": text_type(name),
                "namespace": to_text(namespace),
                "data": dict(((to_text(namespace, False), to_text(name)),
                              to_text(value, False))
                             for (namespace, name), value in attrs.items())}

    def endTag(self, namespace, name):
        """Return an ``EndTag`` token for the given element.

        :arg namespace: element namespace, text or None (emitted as "" when None)
        :arg name: element name, text
        """
        assert namespace is None or isinstance(namespace, string_types), type(namespace)
        # Report the type of the value actually being checked (was type(namespace)).
        assert isinstance(name, string_types), type(name)

        return {"type": "EndTag",
                "name": to_text(name, False),
                "namespace": to_text(namespace),
                "data": {}}

    def text(self, data):
        """Generate the tokens for a run of character data.

        Leading and trailing characters found in ``spaceCharacters`` are
        emitted as separate ``SpaceCharacters`` tokens around a
        ``Characters`` token holding the rest.  Empty parts are skipped, so
        between zero and three tokens are produced.
        """
        assert isinstance(data, string_types), type(data)

        data = to_text(data)
        middle = data.lstrip(spaceCharacters)
        # Whatever lstrip() removed is the leading whitespace.
        left = data[:len(data) - len(middle)]
        if left:
            yield {"type": "SpaceCharacters", "data": left}
        data = middle
        middle = data.rstrip(spaceCharacters)
        # Whatever rstrip() removed is the trailing whitespace.
        right = data[len(middle):]
        if middle:
            yield {"type": "Characters", "data": middle}
        if right:
            yield {"type": "SpaceCharacters", "data": right}

    def comment(self, data):
        """Return a ``Comment`` token whose data is ``data`` as text."""
        assert isinstance(data, string_types), type(data)

        return {"type": "Comment", "data": text_type(data)}

    def doctype(self, name, publicId=None, systemId=None, correct=True):
        """Return a ``Doctype`` token.

        :arg name: doctype name, text or None (None is emitted as "")
        :arg publicId: public identifier, text or None (None is emitted as "")
        :arg systemId: system identifier, text or None (None is emitted as "")
        :arg correct: flag stored in the token under ``"correct"``
        """
        assert is_text_or_none(name), type(name)
        assert is_text_or_none(publicId), type(publicId)
        assert is_text_or_none(systemId), type(systemId)

        return {"type": "Doctype",
                "name": to_text(name),
                "publicId": to_text(publicId),
                "systemId": to_text(systemId),
                # Keep the flag as passed.  Converting it to text turned
                # False into the truthy string "False".
                "correct": correct}

    def entity(self, name):
        """Return an ``Entity`` token for the entity called ``name``."""
        assert isinstance(name, string_types), type(name)

        return {"type": "Entity", "name": text_type(name)}

    def unknown(self, nodeType):
        """Return a ``SerializeError`` token reporting an unhandled node type.

        ``nodeType`` may be any object; it is formatted into the message, so
        integer node-type codes no longer raise ``TypeError``.
        """
        # The one-element tuple keeps %-formatting correct even if nodeType
        # is itself a tuple.
        return self.error("Unknown node type: %s" % (nodeType,))
126
127
class NonRecursiveTreeWalker(TreeWalker):
    """Tree walker that traverses ``self.tree`` with a loop instead of recursion.

    Subclasses supply the four navigation hooks below; ``__iter__`` uses them
    to visit every node in document order and yields the matching tokens.
    """

    def getNodeDetails(self, node):
        """Return a sequence whose first item is the node kind; must be overridden.

        The items after the kind are consumed by ``__iter__`` according to
        that kind (for ELEMENT: namespace, name, attributes, hasChildren).
        """
        raise NotImplementedError

    def getFirstChild(self, node):
        """Return the first child of ``node`` or None; must be overridden."""
        raise NotImplementedError

    def getNextSibling(self, node):
        """Return the next sibling of ``node`` or None; must be overridden."""
        raise NotImplementedError

    def getParentNode(self, node):
        """Return the parent of ``node``; must be overridden."""
        raise NotImplementedError

    def __iter__(self):
        """Yield the token stream for the subtree rooted at ``self.tree``."""
        node = self.tree
        while node is not None:
            info = self.getNodeDetails(node)
            kind, rest = info[0], info[1:]
            descend = False

            if kind == DOCTYPE:
                yield self.doctype(*rest)

            elif kind == TEXT:
                for token in self.text(*rest):
                    yield token

            elif kind == ELEMENT:
                namespace, name, attributes, descend = rest
                if name in voidElements:
                    for token in self.emptyTag(namespace, name, attributes,
                                               descend):
                        yield token
                    # Never walk into a void element, whatever it reported.
                    descend = False
                else:
                    yield self.startTag(namespace, name, attributes)

            elif kind == COMMENT:
                yield self.comment(rest[0])

            elif kind == ENTITY:
                yield self.entity(rest[0])

            elif kind == DOCUMENT:
                descend = True

            else:
                yield self.unknown(rest[0])

            child = self.getFirstChild(node) if descend else None
            if child is not None:
                node = child
                continue

            # No child to visit: climb back up, closing each element we
            # leave, until a next sibling is found or the root is reached.
            while node is not None:
                info = self.getNodeDetails(node)
                kind, rest = info[0], info[1:]
                if kind == ELEMENT:
                    namespace, name, attributes, descend = rest
                    if name not in voidElements:
                        yield self.endTag(namespace, name)
                if self.tree is node:
                    # Back at the starting node: the walk is complete.
                    node = None
                    break
                sibling = self.getNextSibling(node)
                if sibling is not None:
                    node = sibling
                    break
                node = self.getParentNode(node)
201