"""Base classes and token-building helpers for tree walkers.

A tree walker iterates over a tree and yields a flat stream of token
dictionaries (``StartTag``, ``EndTag``, ``EmptyTag``, ``Characters``,
``SpaceCharacters``, ``Comment``, ``Doctype``, ``Entity`` and
``SerializeError``).
"""
from __future__ import absolute_import, division, unicode_literals

from xml.dom import Node

from six import text_type, string_types

from ..constants import voidElements, spaceCharacters

__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
           "TreeWalker", "NonRecursiveTreeWalker"]

# Node-type tags that getNodeDetails() implementations return as the first
# item of their result.  All but UNKNOWN reuse the xml.dom.Node constants.
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"

# Collapse the imported collection into one string so it can be handed
# directly to str.lstrip() / str.rstrip() in TreeWalker.text().
spaceCharacters = "".join(spaceCharacters)


def to_text(s, blank_if_none=True):
    """Wrapper around six.text_type to convert None to empty string.

    :arg s: the value to convert
    :arg blank_if_none: when ``s`` is None, return ``""`` if true and
        ``None`` if false

    :returns: ``s`` itself if it is already text, otherwise ``text_type(s)``
        (or the None replacement described above)
    """
    if s is None:
        return "" if blank_if_none else None
    if isinstance(s, text_type):
        return s
    return text_type(s)


def is_text_or_none(string):
    """Wrapper around isinstance(string_types) or is None."""
    return string is None or isinstance(string, string_types)


def _attrs_are_text(attrs):
    """Return True if ``attrs`` maps (namespace, name) pairs to strings.

    Each namespace must be None or a string; each name and value must be a
    string.  Distinct loop-variable names are used so the element's own
    ``namespace`` / ``name`` are never visually confused with an attribute's.
    """
    return all((attrNamespace is None or isinstance(attrNamespace, string_types)) and
               isinstance(attrName, string_types) and
               isinstance(attrValue, string_types)
               for (attrNamespace, attrName), attrValue in attrs.items())


class TreeWalker(object):
    """Abstract walker: holds the tree and builds the token dictionaries.

    Subclasses must implement ``__iter__``.
    """

    def __init__(self, tree):
        """Remember the tree (or root node) to walk.

        :arg tree: the tree to walk; stored as ``self.tree``
        """
        self.tree = tree

    def __iter__(self):
        raise NotImplementedError

    def error(self, msg):
        """Build a SerializeError token carrying ``msg`` as its data."""
        return {"type": "SerializeError", "data": msg}

    def emptyTag(self, namespace, name, attrs, hasChildren=False):
        """Generate the token(s) for a void element.

        :arg namespace: the element's namespace (string or None)
        :arg name: the element's name
        :arg attrs: dict mapping (namespace, name) pairs to values; it is
            placed in the token as given, without conversion
        :arg hasChildren: whether the element has children

        :returns: generator yielding an EmptyTag token, followed by a
            SerializeError token when ``hasChildren`` is true
        """
        assert namespace is None or isinstance(namespace, string_types), type(namespace)
        assert isinstance(name, string_types), type(name)
        assert _attrs_are_text(attrs)

        yield {"type": "EmptyTag", "name": to_text(name, False),
               "namespace": to_text(namespace),
               "data": attrs}
        if hasChildren:
            yield self.error("Void element has children")

    def startTag(self, namespace, name, attrs):
        """Build a StartTag token.

        :arg namespace: the element's namespace (string or None)
        :arg name: the element's name
        :arg attrs: dict mapping (namespace, name) pairs to values

        :returns: StartTag token whose ``data`` is a new dict with every
            attribute key component and value converted to text (an
            attribute namespace of None stays None)
        """
        assert namespace is None or isinstance(namespace, string_types), type(namespace)
        assert isinstance(name, string_types), type(name)
        assert _attrs_are_text(attrs)

        # dict(generator) rather than a dict comprehension: the module is
        # written against six and may be run on old interpreters.
        data = dict(((to_text(attrNamespace, False), to_text(attrName)),
                     to_text(attrValue, False))
                    for (attrNamespace, attrName), attrValue in attrs.items())
        return {"type": "StartTag",
                "name": text_type(name),
                "namespace": to_text(namespace),
                "data": data}

    def endTag(self, namespace, name):
        """Build an EndTag token.

        :arg namespace: the element's namespace (string or None)
        :arg name: the element's name

        :returns: EndTag token with an empty ``data`` dict
        """
        assert namespace is None or isinstance(namespace, string_types), type(namespace)
        # Report the type of the value actually being checked.
        assert isinstance(name, string_types), type(name)

        return {"type": "EndTag",
                "name": to_text(name, False),
                "namespace": to_text(namespace),
                "data": {}}

    def text(self, data):
        """Generate SpaceCharacters and Characters tokens for ``data``.

        Leading and trailing runs of space characters are split off into
        their own SpaceCharacters tokens; whatever remains in the middle
        becomes a single Characters token.  Empty pieces are skipped, so a
        string made up only of space characters yields one SpaceCharacters
        token and an empty string yields nothing.

        :arg data: the text data

        :returns: generator of up to three tokens, in document order
        """
        assert isinstance(data, string_types), type(data)

        data = to_text(data)
        middle = data.lstrip(spaceCharacters)
        left = data[:len(data) - len(middle)]
        if left:
            yield {"type": "SpaceCharacters", "data": left}
        data = middle
        middle = data.rstrip(spaceCharacters)
        right = data[len(middle):]
        if middle:
            yield {"type": "Characters", "data": middle}
        if right:
            yield {"type": "SpaceCharacters", "data": right}

    def comment(self, data):
        """Build a Comment token for ``data``."""
        assert isinstance(data, string_types), type(data)

        return {"type": "Comment", "data": text_type(data)}

    def doctype(self, name, publicId=None, systemId=None, correct=True):
        """Build a Doctype token.

        :arg name: the doctype name (string or None)
        :arg publicId: the public identifier (string or None)
        :arg systemId: the system identifier (string or None)
        :arg correct: flag stored under ``"correct"``

        :returns: Doctype token; None identifiers become ``""``

        NOTE: ``correct`` is passed through ``to_text`` like the other
        fields, so a boolean ends up as its string form (``"True"`` /
        ``"False"``) and is therefore always truthy.  Kept as is because
        consumers may rely on the existing token shape.
        """
        assert is_text_or_none(name), type(name)
        assert is_text_or_none(publicId), type(publicId)
        assert is_text_or_none(systemId), type(systemId)

        return {"type": "Doctype",
                "name": to_text(name),
                "publicId": to_text(publicId),
                "systemId": to_text(systemId),
                "correct": to_text(correct)}

    def entity(self, name):
        """Build an Entity token for the entity called ``name``."""
        assert isinstance(name, string_types), type(name)

        return {"type": "Entity", "name": text_type(name)}

    def unknown(self, nodeType):
        """Build a SerializeError token for an unrecognised node type.

        :arg nodeType: whatever identifies the node type; it does not have
            to be a string (xml.dom node types are integers, for example)
        """
        # %-formatting instead of "+": concatenating a non-string node type
        # would raise TypeError instead of reporting the problem as a token.
        return self.error("Unknown node type: %s" % (nodeType,))


class NonRecursiveTreeWalker(TreeWalker):
    """Walker that traverses the tree iteratively.

    Subclasses supply the four navigation primitives below; ``__iter__``
    uses them to produce the token stream without recursion.
    """

    def getNodeDetails(self, node):
        """Return a sequence whose first item is one of the module's
        node-type constants and whose remaining items are the arguments for
        that type (for ELEMENT: namespace, name, attributes, hasChildren).
        """
        raise NotImplementedError

    def getFirstChild(self, node):
        """Return the first child of ``node``, or None if there is none."""
        raise NotImplementedError

    def getNextSibling(self, node):
        """Return the next sibling of ``node``, or None if there is none."""
        raise NotImplementedError

    def getParentNode(self, node):
        """Return the parent of ``node``, or None if there is none."""
        raise NotImplementedError

    def __iter__(self):
        """Yield the tokens for ``self.tree`` in document order."""
        currentNode = self.tree
        while currentNode is not None:
            details = self.getNodeDetails(currentNode)
            nodeType, details = details[0], details[1:]
            hasChildren = False

            if nodeType == DOCTYPE:
                yield self.doctype(*details)

            elif nodeType == TEXT:
                for token in self.text(*details):
                    yield token

            elif nodeType == ELEMENT:
                namespace, name, attributes, hasChildren = details
                if name in voidElements:
                    for token in self.emptyTag(namespace, name, attributes,
                                               hasChildren):
                        yield token
                    # Never descend into a void element, even if the tree
                    # gave it children (emptyTag already reported that).
                    hasChildren = False
                else:
                    yield self.startTag(namespace, name, attributes)

            elif nodeType == COMMENT:
                yield self.comment(details[0])

            elif nodeType == ENTITY:
                yield self.entity(details[0])

            elif nodeType == DOCUMENT:
                hasChildren = True

            else:
                yield self.unknown(details[0])

            if hasChildren:
                firstChild = self.getFirstChild(currentNode)
            else:
                firstChild = None

            if firstChild is not None:
                currentNode = firstChild
            else:
                # No children to visit: close this node, then climb towards
                # the root, closing each ancestor, until a next sibling is
                # found or the root of the walk has been closed.
                while currentNode is not None:
                    details = self.getNodeDetails(currentNode)
                    nodeType, details = details[0], details[1:]
                    if nodeType == ELEMENT:
                        namespace, name, _attributes, _hasChildren = details
                        if name not in voidElements:
                            yield self.endTag(namespace, name)
                    if self.tree is currentNode:
                        # Never move past the node the walk started from.
                        currentNode = None
                        break
                    nextSibling = self.getNextSibling(currentNode)
                    if nextSibling is not None:
                        currentNode = nextSibling
                        break
                    else:
                        currentNode = self.getParentNode(currentNode)