"""Facility to use the Expat parser to load a minidom instance
from a string or file.

This avoids all the overhead of SAX and pulldom to gain performance.
"""

# Warning!
#
# This module is tightly bound to the implementation details of the
# minidom DOM and can't be used with other DOM implementations.  This
# is due, in part, to a lack of appropriate methods in the DOM (there is
# no way to create Entity and Notation nodes via the DOM Level 2
# interface), and for performance.  The latter is the cause of some fairly
# cryptic code.
#
# Performance hacks:
#
#  -  .character_data_handler() has an extra case in which continuing
#     data is appended to an existing Text node; this can be a
#     speedup since pyexpat can break up character data into multiple
#     callbacks even though we set the buffer_text attribute on the
#     parser.  This also gives us the advantage that we don't need a
#     separate normalization pass.
#
#  -  Determining that a node exists is done using an identity comparison
#     with None rather than a truth test; this avoids searching for and
#     calling any methods on the node object if it exists.  (A rather
#     nice speedup is achieved this way as well!)
from xml.dom import xmlbuilder, minidom, Node
from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
from xml.parsers import expat
from xml.dom.minidom import _append_child, _set_attribute_node
from xml.dom.NodeFilter import NodeFilter

TEXT_NODE = Node.TEXT_NODE
CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
DOCUMENT_NODE = Node.DOCUMENT_NODE

FILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
FILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
FILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
FILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT

theDOMImplementation = minidom.getDOMImplementation()

# Expat typename -> TypeInfo
_typeinfo_map = {
    "CDATA":    minidom.TypeInfo(None, "cdata"),
    "ENUM":     minidom.TypeInfo(None, "enumeration"),
    "ENTITY":   minidom.TypeInfo(None, "entity"),
    "ENTITIES": minidom.TypeInfo(None, "entities"),
    "ID":       minidom.TypeInfo(None, "id"),
    "IDREF":    minidom.TypeInfo(None, "idref"),
    "IDREFS":   minidom.TypeInfo(None, "idrefs"),
    "NMTOKEN":  minidom.TypeInfo(None, "nmtoken"),
    "NMTOKENS": minidom.TypeInfo(None, "nmtokens"),
    }


class ElementInfo(object):
    """Per-element-type information collected from DTD declarations.

    ``_model`` holds the content model handed to the element declaration
    handler (or None if no such declaration was seen); ``_attr_info`` is a
    list of attribute records appended by the attlist declaration handler.
    """

    __slots__ = '_attr_info', '_model', 'tagName'

    def __init__(self, tagName, model=None):
        self.tagName = tagName
        self._attr_info = []
        self._model = model

    def __getstate__(self):
        return self._attr_info, self._model, self.tagName

    def __setstate__(self, state):
        self._attr_info, self._model, self.tagName = state

    def getAttributeType(self, aname):
        """Return the TypeInfo for attribute *aname*.

        Falls back to ``minidom._no_type`` when the attribute was not
        declared for this element type.
        """
        for info in self._attr_info:
            if info[1] == aname:
                # The declared type sits second from the end of the record.
                declared = info[-2]
                if declared[0] == "(":
                    # A parenthesised list of values is an enumeration.
                    return _typeinfo_map["ENUM"]
                return _typeinfo_map[declared]
        return minidom._no_type

    def getAttributeTypeNS(self, namespaceURI, localName):
        """Namespace-aware lookup; always reports ``minidom._no_type``."""
        return minidom._no_type

    def isElementContent(self):
        """Return True if the declared model is neither ANY nor MIXED.

        Returns False when no content model is recorded.
        """
        if self._model:
            content_type = self._model[0]
            return content_type not in (expat.model.XML_CTYPE_ANY,
                                        expat.model.XML_CTYPE_MIXED)
        return False

    def isEmpty(self):
        """Return True if the element type was declared EMPTY."""
        if self._model:
            return self._model[0] == expat.model.XML_CTYPE_EMPTY
        return False

    def isId(self, aname):
        """Return True if attribute *aname* was declared with type ID."""
        for info in self._attr_info:
            if info[1] == aname:
                return info[-2] == "ID"
        return False

    def isIdNS(self, euri, ename, auri, aname):
        # not sure this is meaningful
        return self.isId((auri, aname))


def _intern(builder, s):
    """Return the copy of *s* held in the builder's parser intern table."""
    return builder._intern_setdefault(s, s)


def _parse_ns_name(builder, name):
    """Split a space-separated Expat namespace name.

    *name* is either ``"uri localname"`` or ``"uri localname prefix"``.
    Returns ``(uri, localname, prefix, qname)`` with each string passed
    through the builder's intern table.

    Raises ValueError if *name* splits into any other number of parts.
    """
    assert ' ' in name
    parts = name.split(' ')
    intern = builder._intern_setdefault
    if len(parts) == 3:
        uri, localname, prefix = parts
        prefix = intern(prefix, prefix)
        qname = "%s:%s" % (prefix, localname)
        qname = intern(qname, qname)
        localname = intern(localname, localname)
    elif len(parts) == 2:
        uri, localname = parts
        prefix = EMPTY_PREFIX
        qname = localname = intern(localname, localname)
    else:
        raise ValueError("Unsupported syntax: spaces in URIs not supported: %r" % name)
    return intern(uri, uri), localname, prefix, qname


class ExpatBuilder:
    """Document builder that uses Expat to build a ParsedXML.DOM document
    instance."""

    def __init__(self, options=None):
        """Set up the builder.

        *options* is an ``xmlbuilder.Options`` instance; a default one is
        created when it is None.  If the options carry a filter it is
        wrapped in a FilterVisibilityController.
        """
        if options is None:
            options = xmlbuilder.Options()
        self._options = options
        if self._options.filter is not None:
            self._filter = FilterVisibilityController(self._options.filter)
        else:
            self._filter = None
            # This *really* doesn't do anything in this case, so
            # override it with something fast & minimal.  (The builtin
            # id() serves as a cheap one-argument no-op here.)
            self._finish_start_element = id
        self._parser = None
        self.reset()

    def createParser(self):
        """Create a new parser object."""
        return expat.ParserCreate()

    def getParser(self):
        """Return the parser object, creating a new one if needed."""
        if not self._parser:
            self._parser = self.createParser()
            self._intern_setdefault = self._parser.intern.setdefault
            self._parser.buffer_text = True
            self._parser.ordered_attributes = True
            self._parser.specified_attributes = True
            self.install(self._parser)
        return self._parser

    def reset(self):
        """Free all data structures used during DOM construction."""
        self.document = theDOMImplementation.createDocument(
            EMPTY_NAMESPACE, None, None)
        self.curNode = self.document
        self._elem_info = self.document._elem_info
        self._cdata = False

    def install(self, parser):
        """Install the callbacks needed to build the DOM into the parser."""
        # This creates circular references!
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.first_element_handler
        parser.EndElementHandler = self.end_element_handler
        parser.ProcessingInstructionHandler = self.pi_handler
        if self._options.entities:
            parser.EntityDeclHandler = self.entity_decl_handler
        parser.NotationDeclHandler = self.notation_decl_handler
        if self._options.comments:
            parser.CommentHandler = self.comment_handler
        if self._options.cdata_sections:
            parser.StartCdataSectionHandler = self.start_cdata_section_handler
            parser.EndCdataSectionHandler = self.end_cdata_section_handler
            parser.CharacterDataHandler = self.character_data_handler_cdata
        else:
            parser.CharacterDataHandler = self.character_data_handler
        parser.ExternalEntityRefHandler = self.external_entity_ref_handler
        parser.XmlDeclHandler = self.xml_decl_handler
        parser.ElementDeclHandler = self.element_decl_handler
        parser.AttlistDeclHandler = self.attlist_decl_handler

    def parseFile(self, file):
        """Parse a document from a file object, returning the document
        node."""
        parser = self.getParser()
        first_buffer = True
        try:
            while buffer := file.read(16*1024):
                parser.Parse(buffer, False)
                # Only the first chunk is handed to the internal-subset
                # extractor, and only once a document element exists.
                if first_buffer and self.document.documentElement:
                    self._setup_subset(buffer)
                first_buffer = False
            parser.Parse(b"", True)
        except ParseEscape:
            # Raised when a filter asks for parsing to be interrupted.
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def parseString(self, string):
        """Parse a document from a string, returning the document node."""
        parser = self.getParser()
        try:
            parser.Parse(string, True)
            self._setup_subset(string)
        except ParseEscape:
            # Raised when a filter asks for parsing to be interrupted.
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def _setup_subset(self, buffer):
        """Load the internal subset if there might be one."""
        if self.document.doctype:
            extractor = InternalSubsetExtractor()
            extractor.parseString(buffer)
            subset = extractor.getSubset()
            self.document.doctype.internalSubset = subset

    def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
                                   has_internal_subset):
        """Create the DocumentType node and attach it to the document.

        If the filter rejects the doctype it is detached again and the
        entity/notation declaration handlers are switched off.
        """
        doctype = self.document.implementation.createDocumentType(
            doctypeName, publicId, systemId)
        doctype.ownerDocument = self.document
        _append_child(self.document, doctype)
        self.document.doctype = doctype
        if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
            self.document.doctype = None
            del self.document.childNodes[-1]
            doctype = None
            self._parser.EntityDeclHandler = None
            self._parser.NotationDeclHandler = None
        if has_internal_subset:
            if doctype is not None:
                # Give the maps mutable lists so declarations can be
                # appended while the subset is parsed.
                doctype.entities._seq = []
                doctype.notations._seq = []
            # Comments and PIs are not recorded while inside the subset;
            # end_doctype_decl_handler() re-installs the handlers.
            self._parser.CommentHandler = None
            self._parser.ProcessingInstructionHandler = None
            self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Re-enable the handlers disabled for the internal subset."""
        if self._options.comments:
            self._parser.CommentHandler = self.comment_handler
        self._parser.ProcessingInstructionHandler = self.pi_handler
        if not (self._elem_info or self._filter):
            # No element declarations and no filter: end-of-element
            # post-processing has nothing to do, so make it a no-op.
            self._finish_end_element = id

    def pi_handler(self, target, data):
        """Append a processing instruction node unless the filter rejects it."""
        node = self.document.createProcessingInstruction(target, data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def character_data_handler_cdata(self, data):
        """Character data callback used when CDATA sections are preserved."""
        childNodes = self.curNode.childNodes
        if self._cdata:
            if (self._cdata_continue
                    and childNodes[-1].nodeType == CDATA_SECTION_NODE):
                # Further data for the CDATA section already started.
                childNodes[-1].appendData(data)
                return
            node = self.document.createCDATASection(data)
            self._cdata_continue = True
        elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
            # Merge with the preceding Text node.
            node = childNodes[-1]
            value = node.data + data
            node.data = value
            return
        else:
            node = minidom.Text()
            node.data = data
            node.ownerDocument = self.document
        _append_child(self.curNode, node)

    def character_data_handler(self, data):
        """Character data callback used when CDATA sections are not kept."""
        childNodes = self.curNode.childNodes
        if childNodes and childNodes[-1].nodeType == TEXT_NODE:
            # Merge with the preceding Text node.
            node = childNodes[-1]
            node.data = node.data + data
            return
        node = minidom.Text()
        node.data = node.data + data
        node.ownerDocument = self.document
        _append_child(self.curNode, node)

    def entity_decl_handler(self, entityName, is_parameter_entity, value,
                            base, systemId, publicId, notationName):
        """Record a general entity declaration on the doctype."""
        if is_parameter_entity:
            # we don't care about parameter entities for the DOM
            return
        if not self._options.entities:
            return
        node = self.document._create_entity(entityName, publicId,
                                            systemId, notationName)
        if value is not None:
            # internal entity
            # node *should* be readonly, but we'll cheat
            child = self.document.createTextNode(value)
            node.childNodes.append(child)
        self.document.doctype.entities._seq.append(node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.entities._seq[-1]

    def notation_decl_handler(self, notationName, base, systemId, publicId):
        """Record a notation declaration on the doctype.

        The notation is dropped again only when the filter rejects it,
        matching the behaviour of the other node handlers.
        """
        node = self.document._create_notation(notationName, publicId, systemId)
        self.document.doctype.notations._seq.append(node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.notations._seq[-1]

    def comment_handler(self, data):
        """Append a comment node unless the filter rejects it."""
        node = self.document.createComment(data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def start_cdata_section_handler(self):
        self._cdata = True
        self._cdata_continue = False

    def end_cdata_section_handler(self):
        self._cdata = False
        self._cdata_continue = False

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Ignore the external entity; returning 1 lets parsing continue."""
        return 1

    def first_element_handler(self, name, attributes):
        """Handle the first start tag, then hand over to the regular handler."""
        if self._filter is None and not self._elem_info:
            # Nothing to post-process at element end; use a no-op.
            self._finish_end_element = id
        self.getParser().StartElementHandler = self.start_element_handler
        self.start_element_handler(name, attributes)

    def start_element_handler(self, name, attributes):
        """Create an element, make it current, and attach its attributes.

        *attributes* is a flat ``[name, value, name, value, ...]`` list
        (the parser is configured with ``ordered_attributes``).
        """
        node = self.document.createElement(name)
        _append_child(self.curNode, node)
        self.curNode = node

        if attributes:
            for i in range(0, len(attributes), 2):
                a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
                                 None, EMPTY_PREFIX)
                value = attributes[i+1]
                a.value = value
                a.ownerDocument = self.document
                _set_attribute_node(node, a)

        if node is not self.document.documentElement:
            self._finish_start_element(node)

    def _finish_start_element(self, node):
        """Consult the filter about a freshly started element."""
        if self._filter:
            # To be general, we'd have to call isSameNode(), but this
            # is sufficient for minidom:
            if node is self.document.documentElement:
                return
            filt = self._filter.startContainer(node)
            if filt == FILTER_REJECT:
                # ignore this node & all descendents
                Rejecter(self)
            elif filt == FILTER_SKIP:
                # ignore this node, but make its children become
                # children of the parent node
                Skipper(self)
            else:
                return
            self.curNode = node.parentNode
            node.parentNode.removeChild(node)
            node.unlink()

    # If this ever changes, Namespaces.end_element_handler() needs to
    # be changed to match.
    #
    def end_element_handler(self, name):
        curNode = self.curNode
        self.curNode = curNode.parentNode
        self._finish_end_element(curNode)

    def _finish_end_element(self, curNode):
        """Post-process a completed element (whitespace and filtering)."""
        info = self._elem_info.get(curNode.tagName)
        if info:
            self._handle_white_text_nodes(curNode, info)
        if self._filter:
            if curNode is self.document.documentElement:
                return
            if self._filter.acceptNode(curNode) == FILTER_REJECT:
                self.curNode.removeChild(curNode)
                curNode.unlink()

    def _handle_white_text_nodes(self, node, info):
        """Drop whitespace-only text children of element-content nodes."""
        if (self._options.whitespace_in_element_content
                or not info.isElementContent()):
            return

        # We have element type information and should remove ignorable
        # whitespace; identify the text nodes which contain only
        # whitespace.
        L = []
        for child in node.childNodes:
            if child.nodeType == TEXT_NODE and not child.data.strip():
                L.append(child)

        # Remove ignorable whitespace from the tree.
        for child in L:
            node.removeChild(child)

    def element_decl_handler(self, name, model):
        """Store the content model declared for element type *name*."""
        info = self._elem_info.get(name)
        if info is None:
            self._elem_info[name] = ElementInfo(name, model)
        else:
            assert info._model is None
            info._model = model

    def attlist_decl_handler(self, elem, name, type, default, required):
        """Store an attribute declaration for element type *elem*."""
        info = self._elem_info.get(elem)
        if info is None:
            info = ElementInfo(elem)
            self._elem_info[elem] = info
        info._attr_info.append(
            [None, name, None, None, default, 0, type, required])

    def xml_decl_handler(self, version, encoding, standalone):
        """Copy the XML declaration's fields onto the document."""
        self.document.version = version
        self.document.encoding = encoding
        # This is still a little ugly, thanks to the pyexpat API. ;-(
        # A negative value leaves document.standalone untouched.
        if standalone >= 0:
            if standalone:
                self.document.standalone = True
            else:
                self.document.standalone = False


# Don't include FILTER_INTERRUPT, since that's checked separately
# where allowed.
_ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)


class FilterVisibilityController(object):
    """Wrapper around a DOMBuilderFilter which implements the checks
    to make the whatToShow filter attribute work."""

    __slots__ = 'filter',

    def __init__(self, filter):
        self.filter = filter

    def startContainer(self, node):
        """Ask the filter about a container node that is being opened.

        Node types masked out by ``whatToShow`` are accepted without
        consulting the filter.  Raises ParseEscape if the filter returns
        FILTER_INTERRUPT and ValueError for any other unknown value.
        """
        mask = self._nodetype_mask[node.nodeType]
        if self.filter.whatToShow & mask:
            val = self.filter.startContainer(node)
            if val == FILTER_INTERRUPT:
                raise ParseEscape
            if val not in _ALLOWED_FILTER_RETURNS:
                raise ValueError(
                    "startContainer() returned illegal value: " + repr(val))
            return val
        else:
            return FILTER_ACCEPT

    def acceptNode(self, node):
        """Ask the filter about a completed node.

        FILTER_SKIP is handled here by moving the node's children to its
        parent and reporting FILTER_REJECT, so callers only need to remove
        the node itself.  Raises ParseEscape if the filter returns
        FILTER_INTERRUPT and ValueError for any other unknown value.
        """
        mask = self._nodetype_mask[node.nodeType]
        if self.filter.whatToShow & mask:
            val = self.filter.acceptNode(node)
            if val == FILTER_INTERRUPT:
                raise ParseEscape
            if val == FILTER_SKIP:
                # move all child nodes to the parent, and remove this node
                parent = node.parentNode
                for child in node.childNodes[:]:
                    parent.appendChild(child)
                # node is handled by the caller
                return FILTER_REJECT
            if val not in _ALLOWED_FILTER_RETURNS:
                raise ValueError(
                    "acceptNode() returned illegal value: " + repr(val))
            return val
        else:
            return FILTER_ACCEPT

    # DOM node type -> NodeFilter whatToShow bit
    _nodetype_mask = {
        Node.ELEMENT_NODE:                NodeFilter.SHOW_ELEMENT,
        Node.ATTRIBUTE_NODE:              NodeFilter.SHOW_ATTRIBUTE,
        Node.TEXT_NODE:                   NodeFilter.SHOW_TEXT,
        Node.CDATA_SECTION_NODE:          NodeFilter.SHOW_CDATA_SECTION,
        Node.ENTITY_REFERENCE_NODE:       NodeFilter.SHOW_ENTITY_REFERENCE,
        Node.ENTITY_NODE:                 NodeFilter.SHOW_ENTITY,
        Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
        Node.COMMENT_NODE:                NodeFilter.SHOW_COMMENT,
        Node.DOCUMENT_NODE:               NodeFilter.SHOW_DOCUMENT,
        Node.DOCUMENT_TYPE_NODE:          NodeFilter.SHOW_DOCUMENT_TYPE,
        Node.DOCUMENT_FRAGMENT_NODE:      NodeFilter.SHOW_DOCUMENT_FRAGMENT,
        Node.NOTATION_NODE:               NodeFilter.SHOW_NOTATION,
        }


class FilterCrutch(object):
    """Base for helpers that temporarily take over the element handlers.

    The constructor saves the parser's current start/end element handlers
    and installs this object's own; subclasses restore the saved handlers
    once the element being filtered has ended.
    """

    __slots__ = '_builder', '_level', '_old_start', '_old_end'

    def __init__(self, builder):
        self._level = 0
        self._builder = builder
        parser = builder._parser
        self._old_start = parser.StartElementHandler
        self._old_end = parser.EndElementHandler
        parser.StartElementHandler = self.start_element_handler
        parser.EndElementHandler = self.end_element_handler


class Rejecter(FilterCrutch):
    """Discard an element together with everything inside it."""

    __slots__ = ()

    def __init__(self, builder):
        FilterCrutch.__init__(self, builder)
        parser = builder._parser
        # Switch off every content handler while inside the rejected
        # element; builder.install() puts them back afterwards.
        for name in ("ProcessingInstructionHandler",
                     "CommentHandler",
                     "CharacterDataHandler",
                     "StartCdataSectionHandler",
                     "EndCdataSectionHandler",
                     "ExternalEntityRefHandler",
                     ):
            setattr(parser, name, None)

    def start_element_handler(self, *args):
        self._level = self._level + 1

    def end_element_handler(self, *args):
        if self._level == 0:
            # restore the old handlers
            parser = self._builder._parser
            self._builder.install(parser)
            parser.StartElementHandler = self._old_start
            parser.EndElementHandler = self._old_end
        else:
            self._level = self._level - 1


class Skipper(FilterCrutch):
    """Drop an element but keep building its children under the parent."""

    __slots__ = ()

    def start_element_handler(self, *args):
        node = self._builder.curNode
        self._old_start(*args)
        # Only count the child if the builder actually descended into it.
        if self._builder.curNode is not node:
            self._level = self._level + 1

    def end_element_handler(self, *args):
        if self._level == 0:
            # We're popping back out of the node we're skipping, so we
            # shouldn't need to do anything but reset the handlers.
            self._builder._parser.StartElementHandler = self._old_start
            self._builder._parser.EndElementHandler = self._old_end
            self._builder = None
        else:
            self._level = self._level - 1
            self._old_end(*args)


# framework document used by the fragment builder.
# Takes a string for the doctype, subset string, and namespace attrs string.

_FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
    "http://xml.python.org/entities/fragment-builder/internal"

_FRAGMENT_BUILDER_TEMPLATE = (
    '''\
<!DOCTYPE wrapper
  %%s [
  <!ENTITY fragment-builder-internal
    SYSTEM "%s">
%%s
]>
<wrapper %%s
>&fragment-builder-internal;</wrapper>'''
    % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)


class FragmentBuilder(ExpatBuilder):
    """Builder which constructs document fragments given XML source
    text and a context node.

    The context node is expected to provide information about the
    namespace declarations which are in scope at the start of the
    fragment.
    """

    def __init__(self, context, options=None):
        # originalDocument is the Document that owns the context node
        # (or the context itself when it already is a Document).
        if context.nodeType == DOCUMENT_NODE:
            self.originalDocument = context
            self.context = context
        else:
            self.originalDocument = context.ownerDocument
            self.context = context
        ExpatBuilder.__init__(self, options)

    def reset(self):
        ExpatBuilder.reset(self)
        self.fragment = None

    def parseFile(self, file):
        """Parse a document fragment from a file object, returning the
        fragment node."""
        return self.parseString(file.read())

    def parseString(self, string):
        """Parse a document fragment from a string, returning the
        fragment node."""
        self._source = string
        parser = self.getParser()
        doctype = self.originalDocument.doctype
        ident = ""
        if doctype:
            subset = doctype.internalSubset or self._getDeclarations()
            if doctype.publicId:
                ident = ('PUBLIC "%s" "%s"'
                         % (doctype.publicId, doctype.systemId))
            elif doctype.systemId:
                ident = 'SYSTEM "%s"' % doctype.systemId
        else:
            subset = ""
        nsattrs = self._getNSattrs() # get ns decls from node's ancestors
        document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
        try:
            parser.Parse(document, True)
            # Grab the result before reset() clears self.fragment.
            fragment = self.fragment
        finally:
            # The builder state is reset on success and on failure alike.
            self.reset()
##         self._parser = None
        return fragment

    def _getDeclarations(self):
        """Re-create the internal subset from the DocumentType node.

        This is only needed if we don't already have the
        internalSubset as a string.
        """
        # Use originalDocument rather than context.ownerDocument: the
        # latter is None when the context node is itself a Document.
        doctype = self.originalDocument.doctype
        s = ""
        if doctype:
            for i in range(doctype.notations.length):
                notation = doctype.notations.item(i)
                if s:
                    s = s + "\n "
                s = "%s<!NOTATION %s" % (s, notation.nodeName)
                if notation.publicId:
                    s = '%s PUBLIC "%s"\n "%s">' \
                        % (s, notation.publicId, notation.systemId)
                else:
                    s = '%s SYSTEM "%s">' % (s, notation.systemId)
            for i in range(doctype.entities.length):
                entity = doctype.entities.item(i)
                if s:
                    s = s + "\n "
                s = "%s<!ENTITY %s" % (s, entity.nodeName)
                if entity.publicId:
                    s = '%s PUBLIC "%s"\n "%s"' \
                        % (s, entity.publicId, entity.systemId)
                elif entity.systemId:
                    s = '%s SYSTEM "%s"' % (s, entity.systemId)
                else:
                    s = '%s "%s"' % (s, entity.firstChild.data)
                if entity.notationName:
                    s = "%s NOTATION %s" % (s, entity.notationName)
                s = s + ">"
        return s

    def _getNSattrs(self):
        """Return namespace attributes for the wrapper element (none here)."""
        return ""

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Parse the caller's source when the wrapper entity is referenced."""
        if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
            # this entref is the one that we made to put the subtree
            # in; all of our given input is parsed in here.
            old_document = self.document
            old_cur_node = self.curNode
            parser = self._parser.ExternalEntityParserCreate(context)
            # put the real document back, parse into the fragment to return
            self.document = self.originalDocument
            self.fragment = self.document.createDocumentFragment()
            self.curNode = self.fragment
            try:
                parser.Parse(self._source, True)
            finally:
                self.curNode = old_cur_node
                self.document = old_document
                self._source = None
            return -1
        else:
            return ExpatBuilder.external_entity_ref_handler(
                self, context, base, systemId, publicId)


class Namespaces:
    """Mix-in class for builders; adds support for namespaces."""

    def _initNamespaces(self):
        # list of (prefix, uri) ns declarations.  Namespace attrs are
        # constructed from this and added to the element's attrs.
        self._ns_ordered_prefixes = []

    def createParser(self):
        """Create a new namespace-handling parser."""
        parser = expat.ParserCreate(namespace_separator=" ")
        parser.namespace_prefixes = True
        return parser

    def install(self, parser):
        """Insert the namespace-handlers onto the parser."""
        ExpatBuilder.install(self, parser)
        if self._options.namespace_declarations:
            parser.StartNamespaceDeclHandler = (
                self.start_namespace_decl_handler)

    def start_namespace_decl_handler(self, prefix, uri):
        """Push this namespace declaration on our storage."""
        self._ns_ordered_prefixes.append((prefix, uri))

    def start_element_handler(self, name, attributes):
        """Create a namespace-aware element and make it the current node.

        Namespace declarations collected since the previous start tag are
        attached as xmlns attributes first, followed by the regular
        attributes from the flat ``[name, value, ...]`` list.
        """
        if ' ' in name:
            uri, localname, prefix, qname = _parse_ns_name(self, name)
        else:
            uri = EMPTY_NAMESPACE
            qname = name
            localname = None
            prefix = EMPTY_PREFIX
        node = minidom.Element(qname, uri, prefix, localname)
        node.ownerDocument = self.document
        _append_child(self.curNode, node)
        self.curNode = node

        if self._ns_ordered_prefixes:
            for prefix, uri in self._ns_ordered_prefixes:
                if prefix:
                    a = minidom.Attr(_intern(self, 'xmlns:' + prefix),
                                     XMLNS_NAMESPACE, prefix, "xmlns")
                else:
                    a = minidom.Attr("xmlns", XMLNS_NAMESPACE,
                                     "xmlns", EMPTY_PREFIX)
                a.value = uri
                a.ownerDocument = self.document
                _set_attribute_node(node, a)
            # The pending declarations now belong to this element.
            del self._ns_ordered_prefixes[:]

        if attributes:
            node._ensure_attributes()
            _attrs = node._attrs
            _attrsNS = node._attrsNS
            for i in range(0, len(attributes), 2):
                aname = attributes[i]
                value = attributes[i+1]
                if ' ' in aname:
                    uri, localname, prefix, qname = _parse_ns_name(self, aname)
                    a = minidom.Attr(qname, uri, localname, prefix)
                    _attrs[qname] = a
                    _attrsNS[(uri, localname)] = a
                else:
                    a = minidom.Attr(aname, EMPTY_NAMESPACE,
                                     aname, EMPTY_PREFIX)
                    _attrs[aname] = a
                    _attrsNS[(EMPTY_NAMESPACE, aname)] = a
                a.ownerDocument = self.document
                a.value = value
                a.ownerElement = node

    if __debug__:
        # This only adds some asserts to the original
        # end_element_handler(), so we only define this when -O is not
        # used.  If changing one, be sure to check the other to see if
        # it needs to be changed as well.
        #
        def end_element_handler(self, name):
            curNode = self.curNode
            if ' ' in name:
                uri, localname, prefix, qname = _parse_ns_name(self, name)
                assert (curNode.namespaceURI == uri
                        and curNode.localName == localname
                        and curNode.prefix == prefix), \
                        "element stack messed up! (namespace)"
            else:
                assert curNode.nodeName == name, \
                       "element stack messed up - bad nodeName"
                assert curNode.namespaceURI == EMPTY_NAMESPACE, \
                       "element stack messed up - bad namespaceURI"
            self.curNode = curNode.parentNode
            self._finish_end_element(curNode)


class ExpatBuilderNS(Namespaces, ExpatBuilder):
    """Document builder that supports namespaces."""

    def reset(self):
        ExpatBuilder.reset(self)
        self._initNamespaces()


class FragmentBuilderNS(Namespaces, FragmentBuilder):
    """Fragment builder that supports namespaces."""

    def reset(self):
        FragmentBuilder.reset(self)
        self._initNamespaces()

    def _getNSattrs(self):
        """Return string of namespace attributes from this element and
        ancestors."""
        # XXX This needs to be re-written to walk the ancestors of the
        # context to build up the namespace information from
        # declarations, elements, and attributes found in context.
        # Otherwise we have to store a bunch more data on the DOM
        # (though that *might* be more reliable -- not clear).
        attrs = ""
        context = self.context
        L = []
        while context:
            if hasattr(context, '_ns_prefix_uri'):
                for prefix, uri in context._ns_prefix_uri.items():
                    # add every new NS decl from context to L and attrs string
                    if prefix in L:
                        # A declaration nearer the context node wins.
                        continue
                    L.append(prefix)
                    if prefix:
                        declname = "xmlns:" + prefix
                    else:
                        declname = "xmlns"
                    if attrs:
                        attrs = "%s\n %s='%s'" % (attrs, declname, uri)
                    else:
                        attrs = " %s='%s'" % (declname, uri)
            context = context.parentNode
        return attrs


class ParseEscape(Exception):
    """Exception raised to short-circuit parsing in InternalSubsetExtractor."""
    pass


class InternalSubsetExtractor(ExpatBuilder):
    """XML processor which can rip out the internal document type subset."""

    subset = None

    def getSubset(self):
        """Return the internal subset as a string."""
        return self.subset

    def parseFile(self, file):
        try:
            ExpatBuilder.parseFile(self, file)
        except ParseEscape:
            pass

    def parseString(self, string):
        try:
            ExpatBuilder.parseString(self, string)
        except ParseEscape:
            pass

    def install(self, parser):
        """Install only the handlers needed to find the internal subset."""
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.start_element_handler

    def start_doctype_decl_handler(self, name, publicId, systemId,
                                   has_internal_subset):
        """Start collecting the subset text, or stop if there is none."""
        if has_internal_subset:
            parser = self.getParser()
            # Collect the raw text the parser reports until the doctype
            # declaration ends.
            self.subset = []
            parser.DefaultHandler = self.subset.append
            parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler
        else:
            raise ParseEscape()

    def end_doctype_decl_handler(self):
        """Join the collected text, normalise line endings, and stop."""
        s = ''.join(self.subset).replace('\r\n', '\n').replace('\r', '\n')
        self.subset = s
        raise ParseEscape()

    def start_element_handler(self, name, attrs):
        # Reaching an element means there is nothing (more) to extract.
        raise ParseEscape()


def parse(file, namespaces=True):
    """Parse a document, returning the resulting Document node.

    'file' may be either a file name or an open file object.
    """
    if namespaces:
        builder = ExpatBuilderNS()
    else:
        builder = ExpatBuilder()

    if isinstance(file, str):
        with open(file, 'rb') as fp:
            result = builder.parseFile(fp)
    else:
        result = builder.parseFile(file)
    return result


def parseString(string, namespaces=True):
    """Parse a document from a string, returning the resulting
    Document node.
    """
    if namespaces:
        builder = ExpatBuilderNS()
    else:
        builder = ExpatBuilder()
    return builder.parseString(string)


def parseFragment(file, context, namespaces=True):
    """Parse a fragment of a document, given the context from which it
    was originally extracted.  context should be the parent of the
    node(s) which are in the fragment.

    'file' may be either a file name or an open file object.
    """
    if namespaces:
        builder = FragmentBuilderNS(context)
    else:
        builder = FragmentBuilder(context)

    if isinstance(file, str):
        with open(file, 'rb') as fp:
            result = builder.parseFile(fp)
    else:
        result = builder.parseFile(file)
    return result


def parseFragmentString(string, context, namespaces=True):
    """Parse a fragment of a document from a string, given the context
    from which it was originally extracted.  context should be the
    parent of the node(s) which are in the fragment.
    """
    if namespaces:
        builder = FragmentBuilderNS(context)
    else:
        builder = FragmentBuilder(context)
    return builder.parseString(string)


def makeBuilder(options):
    """Create a builder based on an Options object."""
    if options.namespaces:
        return ExpatBuilderNS(options)
    else:
        return ExpatBuilder(options)