"""Build DOM nodes lazily from SAX callbacks and expose them as a stream of events."""

import xml.sax
import xml.sax.handler

# Event tokens.  Every queued event is a ``(token, node)`` tuple.
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"


class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that converts callbacks into a queue of DOM events.

    The queue is a singly linked list of two-item lists
    ``[(token, node), next_entry]``.  ``firstEvent`` is a dummy head whose
    slot 1 points at the oldest unread entry; ``lastEvent`` is the tail that
    new entries are linked onto.
    """

    _locator = None
    document = None

    def __init__(self, documentFactory=None):
        """Initialise an empty event queue.

        documentFactory -- object providing ``createDocument(uri, tagname,
            doctype)``; when None, ``startDocument`` falls back to the
            minidom implementation.
        """
        from xml.dom import XML_NAMESPACE
        self.documentFactory = documentFactory
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent
        self.elementStack = []
        self.push = self.elementStack.append
        try:
            # Shadow the class-level pop() with the bound list method.
            self.pop = self.elementStack.pop
        except AttributeError:
            # use class' pop instead
            pass
        self._ns_contexts = [{XML_NAMESPACE:'xml'}] # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]
        # Comments / PIs seen before the document object exists.
        self.pending_events = []

    def pop(self):
        """Remove and return the top of ``elementStack`` (fallback for list.pop)."""
        result = self.elementStack[-1]
        del self.elementStack[-1]
        return result

    def setDocumentLocator(self, locator):
        """Remember the locator supplied by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record a namespace declaration and enter a new prefix context."""
        if not hasattr(self, '_xmlns_attrs'):
            self._xmlns_attrs = []
        # Remembered so startElementNS can re-create the xmlns attribute.
        self._xmlns_attrs.append((prefix or 'xmlns', uri))
        # Save a snapshot of the context as it was before this mapping;
        # endPrefixMapping restores it.
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or None

    def endPrefixMapping(self, prefix):
        """Restore the namespace context saved by startPrefixMapping."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName , attrs):
        """Create an element node (namespace-aware) and queue START_ELEMENT.

        name    -- ``(uri, localname)`` pair
        tagName -- qualified name, or None if the reader did not supply it
        attrs   -- attribute collection keyed by ``(uri, localname)``

        The first element triggers buildDocument(), which also queues
        START_DOCUMENT.
        """
        # Retrieve xml namespace declaration attributes.
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        xmlns_attrs = getattr(self, '_xmlns_attrs', None)
        if xmlns_attrs is not None:
            for aname, value in xmlns_attrs:
                attrs._attrs[(xmlns_uri, aname)] = value
        self._xmlns_attrs = []
        uri, localname = name
        if uri:
            # When using namespaces, the reader may or may not
            # provide us with the original name. If not, create
            # *a* valid tagName from the current context.
            if tagName is None:
                prefix = self._current_context[uri]
                if prefix:
                    tagName = prefix + ":" + localname
                else:
                    tagName = localname
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)
        else:
            # When the tagname is not prefixed, it just appears as
            # localname
            if self.document:
                node = self.document.createElement(localname)
            else:
                node = self.buildDocument(None, localname)

        for aname,value in attrs.items():
            a_uri, a_localname = aname
            if a_uri == xmlns_uri:
                # Namespace declaration: "xmlns" or "xmlns:<prefix>".
                if a_localname == 'xmlns':
                    qname = a_localname
                else:
                    qname = 'xmlns:' + a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            elif a_uri:
                prefix = self._current_context[a_uri]
                if prefix:
                    qname = prefix + ":" + a_localname
                else:
                    qname = a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            else:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            attr.value = value

        self.lastEvent[1] = [(START_ELEMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)

    def endElementNS(self, name, tagName):
        """Queue END_ELEMENT for the element on top of the stack and pop it."""
        self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
        self.lastEvent = self.lastEvent[1]

    def startElement(self, name, attrs):
        """Create an element node (no namespaces) and queue START_ELEMENT."""
        if self.document:
            node = self.document.createElement(name)
        else:
            node = self.buildDocument(None, name)

        for aname,value in attrs.items():
            attr = self.document.createAttribute(aname)
            attr.value = value
            node.setAttributeNode(attr)

        self.lastEvent[1] = [(START_ELEMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)

    def endElement(self, name):
        """Queue END_ELEMENT for the element on top of the stack and pop it."""
        self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
        self.lastEvent = self.lastEvent[1]

    def comment(self, s):
        """Queue a COMMENT event, or park it until the document exists."""
        if self.document:
            node = self.document.createComment(s)
            self.lastEvent[1] = [(COMMENT, node), None]
            self.lastEvent = self.lastEvent[1]
        else:
            # No document yet: keep the raw text, buildDocument() converts it.
            event = [(COMMENT, s), None]
            self.pending_events.append(event)

    def processingInstruction(self, target, data):
        """Queue a PROCESSING_INSTRUCTION event, or park it until the document exists."""
        if self.document:
            node = self.document.createProcessingInstruction(target, data)
            self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
            self.lastEvent = self.lastEvent[1]
        else:
            # No document yet: keep target/data, buildDocument() converts them.
            event = [(PROCESSING_INSTRUCTION, target, data), None]
            self.pending_events.append(event)

    def ignorableWhitespace(self, chars):
        """Queue an IGNORABLE_WHITESPACE event carrying a new text node."""
        node = self.document.createTextNode(chars)
        self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
        self.lastEvent = self.lastEvent[1]

    def characters(self, chars):
        """Queue a CHARACTERS event carrying a new text node."""
        node = self.document.createTextNode(chars)
        self.lastEvent[1] = [(CHARACTERS, node), None]
        self.lastEvent = self.lastEvent[1]

    def startDocument(self):
        """Pick the default document factory (minidom) if none was given."""
        if self.documentFactory is None:
            import xml.dom.minidom
            self.documentFactory = xml.dom.minidom.Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document for the root element and return that element.

        Queues START_DOCUMENT, pushes the document on the element stack and
        then queues every comment / processing instruction that was parked
        in ``pending_events``, now converted into real nodes.
        """
        # Can't do that in startDocument, since we need the tagname
        # XXX: obtain DocumentType
        node = self.documentFactory.createDocument(uri, tagname, None)
        self.document = node
        self.lastEvent[1] = [(START_DOCUMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)
        # Put everything we have seen so far into the document
        for e in self.pending_events:
            if e[0][0] == PROCESSING_INSTRUCTION:
                _,target,data = e[0]
                n = self.document.createProcessingInstruction(target, data)
                e[0] = (PROCESSING_INSTRUCTION, n)
            elif e[0][0] == COMMENT:
                n = self.document.createComment(e[0][1])
                e[0] = (COMMENT, n)
            else:
                raise AssertionError("Unknown pending event ",e[0][0])
            self.lastEvent[1] = e
            self.lastEvent = e
        self.pending_events = None
        return node.firstChild

    def endDocument(self):
        """Queue END_DOCUMENT and pop the document off the element stack."""
        self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
        self.pop()

    def clear(self):
        "clear(): Explicitly release parsing structures"
        self.document = None


class ErrorHandler:
    """Error handler that prints warnings and re-raises errors."""

    def warning(self, exception):
        print(exception)

    def error(self, exception):
        raise exception

    def fatalError(self, exception):
        raise exception


class DOMEventStream:
    """Iterator over the ``(token, node)`` events produced while parsing.

    stream  -- file-like object the XML is read from
    parser  -- SAX parser; if it has no ``feed`` method the whole input is
               parsed up front (see _slurp)
    bufsize -- number of characters/bytes requested per ``stream.read``
    """

    # True only when parse() opened the stream itself; clear() then closes it.
    _owns_stream = False

    def __init__(self, stream, parser, bufsize):
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        if not hasattr(self.parser, 'feed'):
            # Parser is not incremental: replace getEvent on this instance.
            self.getEvent = self._slurp
        self.reset()

    def reset(self):
        """Install a fresh PullDOM handler on the parser."""
        self.pulldom = PullDOM()
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(self.pulldom)

    def __getitem__(self, pos):
        """Deprecated sequence access; returns the next event regardless of pos."""
        import warnings
        warnings.warn(
            "DOMEventStream's __getitem__ method ignores 'pos' parameter. "
            "Use iterator protocol instead.",
            DeprecationWarning,
            stacklevel=2
        )
        rc = self.getEvent()
        if rc:
            return rc
        raise IndexError

    def __next__(self):
        """Return the next event or raise StopIteration when there is none."""
        rc = self.getEvent()
        if rc:
            return rc
        raise StopIteration

    def __iter__(self):
        return self

    def expandNode(self, node):
        """Attach all following nodes to ``node`` until its END_ELEMENT event.

        Events are consumed from the stream; every node encountered is
        appended to the innermost open parent, so ``node`` ends up holding
        its complete subtree.
        """
        event = self.getEvent()
        parents = [node]
        while event:
            token, cur_node = event
            if cur_node is node:
                return
            if token != END_ELEMENT:
                parents[-1].appendChild(cur_node)
            if token == START_ELEMENT:
                parents.append(cur_node)
            elif token == END_ELEMENT:
                del parents[-1]
            event = self.getEvent()

    def getEvent(self):
        """Return the next ``(token, node)`` event, or None at end of input."""
        # use IncrementalParser interface, so we get the desired
        # pull effect
        if not self.pulldom.firstEvent[1]:
            # Queue is empty: re-anchor the tail on the dummy head.
            self.pulldom.lastEvent = self.pulldom.firstEvent
        while not self.pulldom.firstEvent[1]:
            buf = self.stream.read(self.bufsize)
            if not buf:
                # NOTE(review): anything the parser queues during close()
                # (presumably END_DOCUMENT) is not returned by this call.
                self.parser.close()
                return None
            self.parser.feed(buf)
        rc = self.pulldom.firstEvent[1][0]
        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
        return rc

    def _slurp(self):
        """ Fallback replacement for getEvent() using the
            standard SAX2 interface, which means we slurp the
            SAX events into memory (no performance gain, but
            we are compatible to all SAX parsers).
        """
        self.parser.parse(self.stream)
        self.getEvent = self._emit
        return self._emit()

    def _emit(self):
        """ Fallback replacement for getEvent() that emits
            the events that _slurp() read previously.

            Returns None once the queue is empty so that iteration
            terminates cleanly instead of failing on a None entry.
        """
        entry = self.pulldom.firstEvent[1]
        if entry is None:
            return None
        rc = entry[0]
        self.pulldom.firstEvent[1] = entry[1]
        return rc

    def clear(self):
        """clear(): Explicitly release parsing objects"""
        self.pulldom.clear()
        del self.pulldom
        self.parser = None
        # Only close streams this module opened; caller-supplied streams
        # remain the caller's responsibility.
        if self._owns_stream and self.stream is not None:
            self.stream.close()
        self.stream = None


class SAX2DOM(PullDOM):
    """PullDOM variant that also links each new node into its parent."""

    def startElementNS(self, name, tagName , attrs):
        PullDOM.startElementNS(self, name, tagName, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def startElement(self, name, attrs):
        PullDOM.startElement(self, name, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def processingInstruction(self, target, data):
        PullDOM.processingInstruction(self, target, data)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def ignorableWhitespace(self, chars):
        PullDOM.ignorableWhitespace(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def characters(self, chars):
        PullDOM.characters(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)


default_bufsize = (2 ** 14) - 20


def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream for a filename or an open stream.

    stream_or_string -- a ``str`` is treated as a filename and opened in
        binary mode; anything else is used as the stream directly
    parser  -- SAX parser to use; defaults to ``xml.sax.make_parser()``
    bufsize -- read size; defaults to ``default_bufsize``

    A file opened here is closed by the returned stream's ``clear()``.
    """
    if bufsize is None:
        bufsize = default_bufsize
    owns_stream = isinstance(stream_or_string, str)
    if owns_stream:
        stream = open(stream_or_string, 'rb')
    else:
        stream = stream_or_string
    try:
        if not parser:
            parser = xml.sax.make_parser()
        events = DOMEventStream(stream, parser, bufsize)
    except BaseException:
        # Don't leak the file we opened if setup fails.
        if owns_stream:
            stream.close()
        raise
    events._owns_stream = owns_stream
    return events


def parseString(string, parser=None):
    """Return a DOMEventStream over ``string``, read in a single chunk."""
    from io import StringIO

    bufsize = len(string)
    buf = StringIO(string)
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(buf, parser, bufsize)