• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Facility to use the Expat parser to load a minidom instance
2from a string or file.
3
4This avoids all the overhead of SAX and pulldom to gain performance.
5"""
6
7# Warning!
8#
9# This module is tightly bound to the implementation details of the
10# minidom DOM and can't be used with other DOM implementations.  This
11# is due, in part, to a lack of appropriate methods in the DOM (there is
12# no way to create Entity and Notation nodes via the DOM Level 2
13# interface), and for performance.  The latter is the cause of some fairly
14# cryptic code.
15#
16# Performance hacks:
17#
18#   -  .character_data_handler() has an extra case in which continuing
19#      data is appended to an existing Text node; this can be a
20#      speedup since pyexpat can break up character data into multiple
21#      callbacks even though we set the buffer_text attribute on the
22#      parser.  This also gives us the advantage that we don't need a
23#      separate normalization pass.
24#
25#   -  Determining that a node exists is done using an identity comparison
26#      with None rather than a truth test; this avoids searching for and
27#      calling any methods on the node object if it exists.  (A rather
28#      nice speedup is achieved this way as well!)
29
30from xml.dom import xmlbuilder, minidom, Node
31from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
32from xml.parsers import expat
33from xml.dom.minidom import _append_child, _set_attribute_node
34from xml.dom.NodeFilter import NodeFilter
35
# Module-level aliases for the node-type codes that the parser
# callbacks compare against.
TEXT_NODE = Node.TEXT_NODE
CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
DOCUMENT_NODE = Node.DOCUMENT_NODE

# Module-level aliases for the verdict codes a DOMBuilderFilter returns.
FILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
FILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
FILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
FILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT

# The minidom DOM implementation object; ExpatBuilder.reset() uses it to
# create each fresh, empty document.
theDOMImplementation = minidom.getDOMImplementation()
46
# Expat typename -> TypeInfo
#
# Every entry uses a None namespace; the TypeInfo name is the DOM-side
# spelling of the attribute type reported by Expat.
_typeinfo_map = {
    expat_name: minidom.TypeInfo(None, dom_name)
    for expat_name, dom_name in (
        ("CDATA", "cdata"),
        ("ENUM", "enumeration"),
        ("ENTITY", "entity"),
        ("ENTITIES", "entities"),
        ("ID", "id"),
        ("IDREF", "idref"),
        ("IDREFS", "idrefs"),
        ("NMTOKEN", "nmtoken"),
        ("NMTOKENS", "nmtokens"),
    )
}
59
class ElementInfo(object):
    """DTD-derived information about one element type.

    Stores the element's tag name, its content model (or None when no
    model has been recorded) and a list of attribute records.  In each
    record, index 1 is compared against the attribute name and index -2
    is read as the declared attribute type.
    """

    __slots__ = '_attr_info', '_model', 'tagName'

    def __init__(self, tagName, model=None):
        self.tagName = tagName
        self._attr_info = []
        self._model = model

    def __getstate__(self):
        # __slots__ classes have no __dict__, so the state is an
        # explicit tuple.
        return (self._attr_info, self._model, self.tagName)

    def __setstate__(self, state):
        attr_info, model, tag_name = state
        self._attr_info = attr_info
        self._model = model
        self.tagName = tag_name

    def getAttributeType(self, aname):
        """Return the TypeInfo for attribute *aname*.

        Returns minidom._no_type when no record matches the name.
        """
        for record in self._attr_info:
            if record[1] != aname:
                continue
            declared = record[-2]
            # A declared type starting with "(" is mapped to the
            # enumeration type.
            if declared[0] == "(":
                return _typeinfo_map["ENUM"]
            return _typeinfo_map[declared]
        return minidom._no_type

    def getAttributeTypeNS(self, namespaceURI, localName):
        """Always return minidom._no_type."""
        return minidom._no_type

    def isElementContent(self):
        """Return True if a model is recorded and it is neither ANY nor
        MIXED; return False when there is no model."""
        model = self._model
        if not model:
            return False
        open_content = (expat.model.XML_CTYPE_ANY,
                        expat.model.XML_CTYPE_MIXED)
        return model[0] not in open_content

    def isEmpty(self):
        """Return True if the recorded model is of type EMPTY."""
        model = self._model
        if not model:
            return False
        return model[0] == expat.model.XML_CTYPE_EMPTY

    def isId(self, aname):
        """Return True if the first record for *aname* has type "ID"."""
        matches = (record[-2] == "ID"
                   for record in self._attr_info
                   if record[1] == aname)
        return next(matches, False)

    def isIdNS(self, euri, ename, auri, aname):
        # not sure this is meaningful
        key = (auri, aname)
        return self.isId(key)
110
def _intern(builder, s):
    """Return the builder's interned copy of *s*, registering *s* itself
    if it has not been seen before."""
    remember = builder._intern_setdefault
    return remember(s, s)
113
def _parse_ns_name(builder, name):
    """Split a space-separated namespace-qualified name.

    *name* must contain either "uri localname" or "uri localname prefix".
    Returns the tuple (uri, localname, prefix, qname), with the strings
    interned through the builder.  With only two parts the prefix is
    EMPTY_PREFIX and the qname equals the local name.

    Raises ValueError when the name splits into any other number of
    parts.
    """
    assert ' ' in name
    pieces = name.split(' ')
    remember = builder._intern_setdefault
    count = len(pieces)
    if count == 3:
        uri, localname, prefix = pieces
        prefix = remember(prefix, prefix)
        combined = "%s:%s" % (prefix, localname)
        qname = remember(combined, combined)
        localname = remember(localname, localname)
    elif count == 2:
        uri, localname = pieces
        prefix = EMPTY_PREFIX
        localname = remember(localname, localname)
        qname = localname
    else:
        raise ValueError("Unsupported syntax: spaces in URIs not supported: %r" % name)
    return remember(uri, uri), localname, prefix, qname
131
132
class ExpatBuilder:
    """Document builder that uses Expat to build a minidom document
    instance.

    The builder installs its own methods as Expat callbacks and grows
    the tree under ``self.curNode`` as parse events arrive.
    """

    def __init__(self, options=None):
        """Create a builder.

        options -- an xmlbuilder.Options instance; a default one is
        created when omitted.  If it carries a filter, the filter is
        wrapped in a FilterVisibilityController.
        """
        if options is None:
            options = xmlbuilder.Options()
        self._options = options
        if self._options.filter is not None:
            self._filter = FilterVisibilityController(self._options.filter)
        else:
            self._filter = None
            # This *really* doesn't do anything in this case, so
            # override it with something fast & minimal.
            self._finish_start_element = id
        self._parser = None
        self.reset()

    def createParser(self):
        """Create a new parser object."""
        return expat.ParserCreate()

    def getParser(self):
        """Return the parser object, creating a new one if needed."""
        if not self._parser:
            self._parser = self.createParser()
            self._intern_setdefault = self._parser.intern.setdefault
            self._parser.buffer_text = True
            self._parser.ordered_attributes = True
            self._parser.specified_attributes = True
            self.install(self._parser)
        return self._parser

    def reset(self):
        """Free all data structures used during DOM construction."""
        self.document = theDOMImplementation.createDocument(
            EMPTY_NAMESPACE, None, None)
        self.curNode = self.document
        self._elem_info = self.document._elem_info
        self._cdata = False

    def install(self, parser):
        """Install the callbacks needed to build the DOM into the parser."""
        # This creates circular references!
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.first_element_handler
        parser.EndElementHandler = self.end_element_handler
        parser.ProcessingInstructionHandler = self.pi_handler
        if self._options.entities:
            parser.EntityDeclHandler = self.entity_decl_handler
        parser.NotationDeclHandler = self.notation_decl_handler
        if self._options.comments:
            parser.CommentHandler = self.comment_handler
        if self._options.cdata_sections:
            parser.StartCdataSectionHandler = self.start_cdata_section_handler
            parser.EndCdataSectionHandler = self.end_cdata_section_handler
            parser.CharacterDataHandler = self.character_data_handler_cdata
        else:
            parser.CharacterDataHandler = self.character_data_handler
        parser.ExternalEntityRefHandler = self.external_entity_ref_handler
        parser.XmlDeclHandler = self.xml_decl_handler
        parser.ElementDeclHandler = self.element_decl_handler
        parser.AttlistDeclHandler = self.attlist_decl_handler

    def parseFile(self, file):
        """Parse a document from a file object, returning the document
        node.

        The file is read in 16 KiB chunks.  A ParseEscape raised by a
        callback stops parsing; whatever was built so far is returned.
        """
        parser = self.getParser()
        first_buffer = True
        try:
            while True:
                buffer = file.read(16*1024)
                if not buffer:
                    break
                parser.Parse(buffer, False)
                # Only the first chunk is scanned for an internal subset.
                if first_buffer and self.document.documentElement:
                    self._setup_subset(buffer)
                first_buffer = False
            parser.Parse(b"", True)
        except ParseEscape:
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def parseString(self, string):
        """Parse a document from a string, returning the document node."""
        parser = self.getParser()
        try:
            parser.Parse(string, True)
            self._setup_subset(string)
        except ParseEscape:
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def _setup_subset(self, buffer):
        """Load the internal subset if there might be one."""
        if self.document.doctype:
            extractor = InternalSubsetExtractor()
            extractor.parseString(buffer)
            subset = extractor.getSubset()
            self.document.doctype.internalSubset = subset

    def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
                                   has_internal_subset):
        """Create the DocumentType node and attach it to the document.

        If the filter rejects the doctype it is removed again and the
        entity/notation declaration handlers are switched off.
        """
        doctype = self.document.implementation.createDocumentType(
            doctypeName, publicId, systemId)
        doctype.ownerDocument = self.document
        _append_child(self.document, doctype)
        self.document.doctype = doctype
        if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
            self.document.doctype = None
            del self.document.childNodes[-1]
            doctype = None
            self._parser.EntityDeclHandler = None
            self._parser.NotationDeclHandler = None
        if has_internal_subset:
            if doctype is not None:
                # Give the maps fresh lists that the declaration
                # handlers append to.
                doctype.entities._seq = []
                doctype.notations._seq = []
            # Comments and PIs are not reported while inside the
            # subset; end_doctype_decl_handler() restores the handlers.
            self._parser.CommentHandler = None
            self._parser.ProcessingInstructionHandler = None
            self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Re-enable the comment and PI handlers after the doctype."""
        if self._options.comments:
            self._parser.CommentHandler = self.comment_handler
        self._parser.ProcessingInstructionHandler = self.pi_handler
        if not (self._elem_info or self._filter):
            # Nothing for _finish_end_element() to do; use a cheap no-op.
            self._finish_end_element = id

    def pi_handler(self, target, data):
        """Append a processing instruction, unless the filter rejects it."""
        node = self.document.createProcessingInstruction(target, data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def character_data_handler_cdata(self, data):
        """Character data callback used when CDATA sections are kept.

        Inside a CDATA section the data goes into a CDATASection node
        (continuing the current one if possible); otherwise it is
        appended to a trailing Text node or placed in a new one.
        """
        childNodes = self.curNode.childNodes
        if self._cdata:
            if (  self._cdata_continue
                  and childNodes[-1].nodeType == CDATA_SECTION_NODE):
                childNodes[-1].appendData(data)
                return
            node = self.document.createCDATASection(data)
            self._cdata_continue = True
        elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
            node = childNodes[-1]
            value = node.data + data
            node.data = value
            return
        else:
            node = minidom.Text()
            node.data = data
            node.ownerDocument = self.document
        _append_child(self.curNode, node)

    def character_data_handler(self, data):
        """Character data callback used when CDATA sections are not kept.

        Data is appended to a trailing Text node when there is one, so
        adjacent text ends up in a single node.
        """
        childNodes = self.curNode.childNodes
        if childNodes and childNodes[-1].nodeType == TEXT_NODE:
            node = childNodes[-1]
            node.data = node.data + data
            return
        node = minidom.Text()
        node.data = node.data + data
        node.ownerDocument = self.document
        _append_child(self.curNode, node)

    def entity_decl_handler(self, entityName, is_parameter_entity, value,
                            base, systemId, publicId, notationName):
        """Record a general entity declaration on the doctype."""
        if is_parameter_entity:
            # we don't care about parameter entities for the DOM
            return
        if not self._options.entities:
            return
        node = self.document._create_entity(entityName, publicId,
                                            systemId, notationName)
        if value is not None:
            # internal entity
            # node *should* be readonly, but we'll cheat
            child = self.document.createTextNode(value)
            node.childNodes.append(child)
        self.document.doctype.entities._seq.append(node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.entities._seq[-1]

    def notation_decl_handler(self, notationName, base, systemId, publicId):
        """Record a notation declaration on the doctype.

        The notation is dropped again only when the filter rejects it,
        mirroring entity_decl_handler().
        """
        node = self.document._create_notation(notationName, publicId, systemId)
        self.document.doctype.notations._seq.append(node)
        # Compare against FILTER_REJECT: testing for FILTER_ACCEPT here
        # would discard exactly the notations the filter wants to keep.
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.notations._seq[-1]

    def comment_handler(self, data):
        """Append a comment node, unless the filter rejects it."""
        node = self.document.createComment(data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def start_cdata_section_handler(self):
        """Note that character data now belongs to a CDATA section."""
        self._cdata = True
        self._cdata_continue = False

    def end_cdata_section_handler(self):
        """Note that the CDATA section has ended."""
        self._cdata = False
        self._cdata_continue = False

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Do not load the external entity; return 1 so that the parser
        does not treat the reference as a handling error."""
        return 1

    def first_element_handler(self, name, attributes):
        """Handle the document element, then switch the parser over to
        start_element_handler() for all later elements."""
        if self._filter is None and not self._elem_info:
            # Nothing for _finish_end_element() to do; use a cheap no-op.
            self._finish_end_element = id
        self.getParser().StartElementHandler = self.start_element_handler
        self.start_element_handler(name, attributes)

    def start_element_handler(self, name, attributes):
        """Create an element, make it the current node and attach its
        attributes.

        *attributes* is a flat [name, value, name, value, ...] sequence.
        """
        node = self.document.createElement(name)
        _append_child(self.curNode, node)
        self.curNode = node

        if attributes:
            for i in range(0, len(attributes), 2):
                a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
                                 None, EMPTY_PREFIX)
                value = attributes[i+1]
                a.value = value
                a.ownerDocument = self.document
                _set_attribute_node(node, a)

        if node is not self.document.documentElement:
            self._finish_start_element(node)

    def _finish_start_element(self, node):
        """Ask the filter whether *node* should be built, and install a
        Rejecter or Skipper when it should not."""
        if self._filter:
            # To be general, we'd have to call isSameNode(), but this
            # is sufficient for minidom:
            if node is self.document.documentElement:
                return
            filt = self._filter.startContainer(node)
            if filt == FILTER_REJECT:
                # ignore this node & all descendants
                Rejecter(self)
            elif filt == FILTER_SKIP:
                # ignore this node, but make its children become
                # children of the parent node
                Skipper(self)
            else:
                return
            self.curNode = node.parentNode
            node.parentNode.removeChild(node)
            node.unlink()

    # If this ever changes, Namespaces.end_element_handler() needs to
    # be changed to match.
    #
    def end_element_handler(self, name):
        """Pop the current element and run the end-of-element checks."""
        curNode = self.curNode
        self.curNode = curNode.parentNode
        self._finish_end_element(curNode)

    def _finish_end_element(self, curNode):
        """Strip ignorable whitespace from a finished element and apply
        the filter to it (the document element is never filtered)."""
        info = self._elem_info.get(curNode.tagName)
        if info:
            self._handle_white_text_nodes(curNode, info)
        if self._filter:
            if curNode is self.document.documentElement:
                return
            if self._filter.acceptNode(curNode) == FILTER_REJECT:
                self.curNode.removeChild(curNode)
                curNode.unlink()

    def _handle_white_text_nodes(self, node, info):
        """Remove whitespace-only Text children of *node* when its
        declared content model is element content and the options do not
        ask for such whitespace to be kept."""
        if (self._options.whitespace_in_element_content
            or not info.isElementContent()):
            return

        # We have element type information and should remove ignorable
        # whitespace; identify the text nodes which contain only
        # whitespace before removing any, so the child list is not
        # modified while it is being iterated.
        ignorable = [child for child in node.childNodes
                     if child.nodeType == TEXT_NODE and not child.data.strip()]

        # Remove ignorable whitespace from the tree.
        for child in ignorable:
            node.removeChild(child)

    def element_decl_handler(self, name, model):
        """Record the content model declared for element type *name*."""
        info = self._elem_info.get(name)
        if info is None:
            self._elem_info[name] = ElementInfo(name, model)
        else:
            assert info._model is None
            info._model = model

    def attlist_decl_handler(self, elem, name, type, default, required):
        """Record one declared attribute for element type *elem*."""
        info = self._elem_info.get(elem)
        if info is None:
            info = ElementInfo(elem)
            self._elem_info[elem] = info
        info._attr_info.append(
            [None, name, None, None, default, 0, type, required])

    def xml_decl_handler(self, version, encoding, standalone):
        """Copy the XML declaration's values onto the document.

        A negative *standalone* leaves document.standalone untouched.
        """
        self.document.version = version
        self.document.encoding = encoding
        # This is still a little ugly, thanks to the pyexpat API. ;-(
        if standalone >= 0:
            self.document.standalone = bool(standalone)
450
451
# Verdicts a filter callback may hand back to FilterVisibilityController
# without triggering a ValueError.
# Don't include FILTER_INTERRUPT, since that's checked separately
# where allowed.
_ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)
455
class FilterVisibilityController(object):
    """Wrapper around a DOMBuilderFilter which implements the checks
    to make the whatToShow filter attribute work.

    Nodes whose type is masked out by whatToShow are accepted without
    consulting the wrapped filter.
    """

    __slots__ = 'filter',

    def __init__(self, filter):
        self.filter = filter

    def startContainer(self, node):
        """Return the filter's startContainer() verdict for *node*.

        Raises ParseEscape on FILTER_INTERRUPT and ValueError on any
        value that is not a recognised verdict.
        """
        mask = self._nodetype_mask[node.nodeType]
        if not (self.filter.whatToShow & mask):
            return FILTER_ACCEPT
        verdict = self.filter.startContainer(node)
        if verdict == FILTER_INTERRUPT:
            raise ParseEscape
        if verdict not in _ALLOWED_FILTER_RETURNS:
            raise ValueError(
                  "startContainer() returned illegal value: " + repr(verdict))
        return verdict

    def acceptNode(self, node):
        """Return the filter's acceptNode() verdict for *node*.

        FILTER_SKIP is carried out here by re-parenting the node's
        children onto its parent; FILTER_REJECT is then returned so the
        caller removes the node itself.  Raises ParseEscape on
        FILTER_INTERRUPT and ValueError on an unrecognised verdict.
        """
        mask = self._nodetype_mask[node.nodeType]
        if not (self.filter.whatToShow & mask):
            return FILTER_ACCEPT
        verdict = self.filter.acceptNode(node)
        if verdict == FILTER_INTERRUPT:
            raise ParseEscape
        if verdict == FILTER_SKIP:
            # move all child nodes to the parent, and remove this node
            new_parent = node.parentNode
            # iterate over a copy: appendChild() mutates node.childNodes
            for orphan in node.childNodes[:]:
                new_parent.appendChild(orphan)
            # node is handled by the caller
            return FILTER_REJECT
        if verdict not in _ALLOWED_FILTER_RETURNS:
            raise ValueError(
                  "acceptNode() returned illegal value: " + repr(verdict))
        return verdict

    # Node type -> whatToShow bit.
    _nodetype_mask = {
        Node.ELEMENT_NODE:                NodeFilter.SHOW_ELEMENT,
        Node.ATTRIBUTE_NODE:              NodeFilter.SHOW_ATTRIBUTE,
        Node.TEXT_NODE:                   NodeFilter.SHOW_TEXT,
        Node.CDATA_SECTION_NODE:          NodeFilter.SHOW_CDATA_SECTION,
        Node.ENTITY_REFERENCE_NODE:       NodeFilter.SHOW_ENTITY_REFERENCE,
        Node.ENTITY_NODE:                 NodeFilter.SHOW_ENTITY,
        Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
        Node.COMMENT_NODE:                NodeFilter.SHOW_COMMENT,
        Node.DOCUMENT_NODE:               NodeFilter.SHOW_DOCUMENT,
        Node.DOCUMENT_TYPE_NODE:          NodeFilter.SHOW_DOCUMENT_TYPE,
        Node.DOCUMENT_FRAGMENT_NODE:      NodeFilter.SHOW_DOCUMENT_FRAGMENT,
        Node.NOTATION_NODE:               NodeFilter.SHOW_NOTATION,
        }
512
513
class FilterCrutch(object):
    """Base for helpers that temporarily take over a builder's element
    handlers.

    On construction the parser's current start/end element handlers are
    saved and replaced by this object's own start_element_handler() and
    end_element_handler(), which subclasses provide.
    """

    __slots__ = '_builder', '_level', '_old_start', '_old_end'

    def __init__(self, builder):
        parser = builder._parser
        self._builder = builder
        self._level = 0
        # Remember what was installed so a subclass can put it back.
        self._old_start, self._old_end = (parser.StartElementHandler,
                                          parser.EndElementHandler)
        parser.StartElementHandler = self.start_element_handler
        parser.EndElementHandler = self.end_element_handler
525
class Rejecter(FilterCrutch):
    """Swallow a rejected element together with everything inside it.

    All content handlers are switched off while the element is open;
    only the nesting depth is tracked.  When the rejected element's own
    end tag arrives the builder's handlers are reinstalled.
    """

    __slots__ = ()

    def __init__(self, builder):
        FilterCrutch.__init__(self, builder)
        silenced = (
            "ProcessingInstructionHandler",
            "CommentHandler",
            "CharacterDataHandler",
            "StartCdataSectionHandler",
            "EndCdataSectionHandler",
            "ExternalEntityRefHandler",
        )
        parser = builder._parser
        for handler_name in silenced:
            setattr(parser, handler_name, None)

    def start_element_handler(self, *args):
        self._level += 1

    def end_element_handler(self, *args):
        if self._level != 0:
            self._level -= 1
            return
        # restore the old handlers
        parser = self._builder._parser
        self._builder.install(parser)
        # install() resets the element handlers as well, so put back the
        # ones that were active before this object took over.
        parser.StartElementHandler = self._old_start
        parser.EndElementHandler = self._old_end
553
class Skipper(FilterCrutch):
    """Drop a skipped element while still building its children.

    Start/end events are forwarded to the saved handlers; the depth
    counter tracks how many forwarded elements are currently open so the
    skipped element's own end tag can be recognised.
    """

    __slots__ = ()

    def start_element_handler(self, *args):
        before = self._builder.curNode
        self._old_start(*args)
        # Only count the element if the builder actually descended
        # into it.
        if self._builder.curNode is not before:
            self._level += 1

    def end_element_handler(self, *args):
        if self._level != 0:
            self._level -= 1
            self._old_end(*args)
            return
        # We're popping back out of the node we're skipping, so we
        # shouldn't need to do anything but reset the handlers.
        parser = self._builder._parser
        parser.StartElementHandler = self._old_start
        parser.EndElementHandler = self._old_end
        self._builder = None
573
574
# framework document used by the fragment builder.
# Takes a string for the doctype, subset string, and namespace attrs string.

# System identifier of the synthetic external entity through which the
# fragment source is fed to the parser.
_FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
    "http://xml.python.org/entities/fragment-builder/internal"

# The template is formatted in two stages: the single "%s" is filled in
# right here with the system identifier, which also turns each "%%s"
# into a plain "%s" placeholder for the later substitution of the
# doctype identifier, internal subset and namespace attributes.
_FRAGMENT_BUILDER_TEMPLATE = (
    '''\
<!DOCTYPE wrapper
  %%s [
  <!ENTITY fragment-builder-internal
    SYSTEM "%s">
%%s
]>
<wrapper %%s
>&fragment-builder-internal;</wrapper>'''
    % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)
592
593
class FragmentBuilder(ExpatBuilder):
    """Builder which constructs document fragments given XML source
    text and a context node.

    The context node is expected to provide information about the
    namespace declarations which are in scope at the start of the
    fragment.
    """

    def __init__(self, context, options=None):
        """Create a fragment builder.

        context -- either a Document, or a node whose ownerDocument is
        the document the fragment will belong to.
        options -- passed through to ExpatBuilder.
        """
        if context.nodeType == DOCUMENT_NODE:
            self.originalDocument = context
        else:
            self.originalDocument = context.ownerDocument
        self.context = context
        ExpatBuilder.__init__(self, options)

    def reset(self):
        """Reset the builder state and forget any built fragment."""
        ExpatBuilder.reset(self)
        self.fragment = None

    def parseFile(self, file):
        """Parse a document fragment from a file object, returning the
        fragment node."""
        return self.parseString(file.read())

    def parseString(self, string):
        """Parse a document fragment from a string, returning the
        fragment node.

        The source is not parsed directly: a small wrapper document is
        parsed instead, and the source is pulled in through an external
        entity reference (see external_entity_ref_handler()).
        """
        self._source = string
        parser = self.getParser()
        doctype = self.originalDocument.doctype
        ident = ""
        if doctype:
            subset = doctype.internalSubset or self._getDeclarations()
            if doctype.publicId:
                ident = ('PUBLIC "%s" "%s"'
                         % (doctype.publicId, doctype.systemId))
            elif doctype.systemId:
                ident = 'SYSTEM "%s"' % doctype.systemId
        else:
            subset = ""
        nsattrs = self._getNSattrs() # get ns decls from node's ancestors
        document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
        try:
            parser.Parse(document, True)
        except BaseException:
            # Clean up for *any* failure, then let it propagate.
            self.reset()
            raise
        fragment = self.fragment
        self.reset()
##         self._parser = None
        return fragment

    def _getDeclarations(self):
        """Re-create the internal subset from the DocumentType node.

        This is only needed if we don't already have the
        internalSubset as a string.
        """
        # Use originalDocument rather than context.ownerDocument: when
        # the context *is* the document, its ownerDocument is None.
        doctype = self.originalDocument.doctype
        if not doctype:
            return ""
        declarations = []
        notations = doctype.notations
        for i in range(notations.length):
            notation = notations.item(i)
            decl = "<!NOTATION %s" % notation.nodeName
            if notation.publicId:
                decl = '%s PUBLIC "%s"\n             "%s">' \
                       % (decl, notation.publicId, notation.systemId)
            else:
                decl = '%s SYSTEM "%s">' % (decl, notation.systemId)
            declarations.append(decl)
        entities = doctype.entities
        for i in range(entities.length):
            entity = entities.item(i)
            decl = "<!ENTITY %s" % entity.nodeName
            if entity.publicId:
                decl = '%s PUBLIC "%s"\n             "%s"' \
                       % (decl, entity.publicId, entity.systemId)
            elif entity.systemId:
                decl = '%s SYSTEM "%s"' % (decl, entity.systemId)
            else:
                decl = '%s "%s"' % (decl, entity.firstChild.data)
            if entity.notationName:
                # XML spells the unparsed-entity marker "NDATA".
                decl = "%s NDATA %s" % (decl, entity.notationName)
            declarations.append(decl + ">")
        return "\n  ".join(declarations)

    def _getNSattrs(self):
        """Return the namespace attributes to put on the wrapper
        element; none for the non-namespace builder."""
        return ""

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Parse the fragment source when the wrapper's internal entity
        is referenced; defer to ExpatBuilder for any other entity."""
        if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
            # this entref is the one that we made to put the subtree
            # in; all of our given input is parsed in here.
            old_document = self.document
            old_cur_node = self.curNode
            parser = self._parser.ExternalEntityParserCreate(context)
            # put the real document back, parse into the fragment to return
            self.document = self.originalDocument
            self.fragment = self.document.createDocumentFragment()
            self.curNode = self.fragment
            try:
                parser.Parse(self._source, True)
            finally:
                self.curNode = old_cur_node
                self.document = old_document
                self._source = None
            return -1
        else:
            return ExpatBuilder.external_entity_ref_handler(
                self, context, base, systemId, publicId)
709
710
class Namespaces:
    """Mix-in class for builders; adds support for namespaces."""

    def _initNamespaces(self):
        # list of (prefix, uri) ns declarations.  Namespace attrs are
        # constructed from this and added to the element's attrs.
        self._ns_ordered_prefixes = []

    def createParser(self):
        """Create a new namespace-handling parser."""
        ns_parser = expat.ParserCreate(namespace_separator=" ")
        ns_parser.namespace_prefixes = True
        return ns_parser

    def install(self, parser):
        """Insert the namespace-handlers onto the parser."""
        ExpatBuilder.install(self, parser)
        if not self._options.namespace_declarations:
            return
        parser.StartNamespaceDeclHandler = self.start_namespace_decl_handler

    def start_namespace_decl_handler(self, prefix, uri):
        """Push this namespace declaration on our storage."""
        declaration = (prefix, uri)
        self._ns_ordered_prefixes.append(declaration)

    def start_element_handler(self, name, attributes):
        """Create a namespace-aware element with its attributes.

        Any namespace declarations collected since the previous element
        are attached first as xmlns attributes, then the regular
        attributes from the flat [name, value, ...] sequence follow.
        """
        if ' ' in name:
            uri, localname, prefix, qname = _parse_ns_name(self, name)
        else:
            uri, localname = EMPTY_NAMESPACE, None
            prefix, qname = EMPTY_PREFIX, name
        element = minidom.Element(qname, uri, prefix, localname)
        element.ownerDocument = self.document
        _append_child(self.curNode, element)
        self.curNode = element

        pending = self._ns_ordered_prefixes
        if pending:
            for ns_prefix, ns_uri in pending:
                if ns_prefix:
                    decl = minidom.Attr(_intern(self, 'xmlns:' + ns_prefix),
                                        XMLNS_NAMESPACE, ns_prefix, "xmlns")
                else:
                    decl = minidom.Attr("xmlns", XMLNS_NAMESPACE,
                                        "xmlns", EMPTY_PREFIX)
                decl.value = ns_uri
                decl.ownerDocument = self.document
                _set_attribute_node(element, decl)
            # Empty the list in place; the declarations are now consumed.
            del pending[:]

        if attributes:
            element._ensure_attributes()
            by_name = element._attrs
            by_ns = element._attrsNS
            for idx in range(0, len(attributes), 2):
                aname = attributes[idx]
                value = attributes[idx + 1]
                if ' ' in aname:
                    uri, localname, prefix, qname = _parse_ns_name(self, aname)
                    attr = minidom.Attr(qname, uri, localname, prefix)
                    by_name[qname] = attr
                    by_ns[(uri, localname)] = attr
                else:
                    attr = minidom.Attr(aname, EMPTY_NAMESPACE,
                                        aname, EMPTY_PREFIX)
                    by_name[aname] = attr
                    by_ns[(EMPTY_NAMESPACE, aname)] = attr
                attr.ownerDocument = self.document
                attr.value = value
                attr.ownerElement = element

    if __debug__:
        # This only adds some asserts to the original
        # end_element_handler(), so we only define this when -O is not
        # used.  If changing one, be sure to check the other to see if
        # it needs to be changed as well.
        #
        def end_element_handler(self, name):
            closing = self.curNode
            if ' ' in name:
                uri, localname, prefix, qname = _parse_ns_name(self, name)
                assert (closing.namespaceURI == uri
                        and closing.localName == localname
                        and closing.prefix == prefix), \
                        "element stack messed up! (namespace)"
            else:
                assert closing.nodeName == name, \
                       "element stack messed up - bad nodeName"
                assert closing.namespaceURI == EMPTY_NAMESPACE, \
                       "element stack messed up - bad namespaceURI"
            self.curNode = closing.parentNode
            self._finish_end_element(closing)
804
805
class ExpatBuilderNS(Namespaces, ExpatBuilder):
    """Namespace-aware document builder.

    Combines Namespaces with ExpatBuilder.
    """

    def reset(self):
        """Reset the ExpatBuilder state, then call _initNamespaces()."""
        # The base reset is invoked explicitly on ExpatBuilder, and the
        # namespace state is set up only after it has run.
        ExpatBuilder.reset(self)
        self._initNamespaces()
812
813
class FragmentBuilderNS(Namespaces, FragmentBuilder):
    """Fragment builder that supports namespaces."""

    def reset(self):
        """Reset the FragmentBuilder state, then call _initNamespaces()."""
        FragmentBuilder.reset(self)
        self._initNamespaces()

    def _getNSattrs(self):
        """Return string of namespace attributes from this element and
        ancestors.

        The result is either the empty string or a run of
        "xmlns[:prefix]='uri'" declarations: the first is preceded by a
        single space, later ones by a newline and four spaces.  A
        prefix found on a nearer node hides the same prefix on an
        ancestor.  URI values are escaped so that '&', '<', '>' and "'"
        cannot break out of the single-quoted attribute value.
        """
        # XXX This needs to be re-written to walk the ancestors of the
        # context to build up the namespace information from
        # declarations, elements, and attributes found in context.
        # Otherwise we have to store a bunch more data on the DOM
        # (though that *might* be more reliable -- not clear).

        # Imported here so the module-level imports stay untouched.
        from xml.sax.saxutils import escape

        seen = set()        # prefixes already declared by a nearer node
        decls = []          # "name='value'" strings, nearest node first
        context = self.context
        while context:
            if hasattr(context, '_ns_prefix_uri'):
                for prefix, uri in context._ns_prefix_uri.items():
                    if prefix in seen:
                        continue
                    seen.add(prefix)
                    declname = ("xmlns:" + prefix) if prefix else "xmlns"
                    # NOTE(review): assumes _ns_prefix_uri stores raw
                    # (unescaped) URIs -- confirm against whatever
                    # populates it.  str() keeps the old '%s' tolerance
                    # for non-string values.
                    value = escape(str(uri), {"'": "&apos;"})
                    decls.append("%s='%s'" % (declname, value))
            context = context.parentNode
        if not decls:
            return ""
        return " " + "\n    ".join(decls)
849
850
class ParseEscape(Exception):
    """Raised by InternalSubsetExtractor handlers to stop parsing early."""
854
class InternalSubsetExtractor(ExpatBuilder):
    """XML processor that pulls out the internal document type subset.

    Parsing is abandoned (by raising ParseEscape from a handler) as
    soon as the subset has been collected or it is clear there is
    none; parseFile() and parseString() swallow that exception and
    return None.  The outcome is read back with getSubset().
    """

    # Stays None unless a doctype with an internal subset is seen.
    subset = None

    def getSubset(self):
        """Return the internal subset as a string (None if not found)."""
        return self.subset

    def parseFile(self, file):
        """Parse 'file', ignoring the ParseEscape used to stop early."""
        try:
            ExpatBuilder.parseFile(self, file)
        except ParseEscape:
            pass

    def parseString(self, string):
        """Parse 'string', ignoring the ParseEscape used to stop early."""
        try:
            ExpatBuilder.parseString(self, string)
        except ParseEscape:
            pass

    def install(self, parser):
        """Install only the two handlers this extractor needs."""
        parser.StartElementHandler = self.start_element_handler
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler

    def start_doctype_decl_handler(self, name, publicId, systemId,
                                   has_internal_subset):
        """Begin collecting the subset, or bail out if there is none."""
        if not has_internal_subset:
            raise ParseEscape()
        parser = self.getParser()
        pieces = []
        self.subset = pieces
        # Every chunk passed to the default handler is appended to the
        # list until the end of the doctype declaration is reported.
        parser.DefaultHandler = pieces.append
        parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Join the collected chunks, normalize newlines, stop parsing."""
        text = ''.join(self.subset)
        # '\r\n' must be handled before the lone '\r'.
        self.subset = text.replace('\r\n', '\n').replace('\r', '\n')
        raise ParseEscape()

    def start_element_handler(self, name, attrs):
        """Stop parsing when an element start is reported."""
        raise ParseEscape()
897
898
def parse(file, namespaces=True):
    """Parse a document and return the resulting Document node.

    'file' may be either a file name or an open file object.  A true
    'namespaces' selects the namespace-aware builder.
    """
    builder = ExpatBuilderNS() if namespaces else ExpatBuilder()

    if not isinstance(file, str):
        # Already an open file object: hand it straight to the builder.
        return builder.parseFile(file)

    # A file name: open it in binary mode for the duration of the parse.
    with open(file, 'rb') as stream:
        return builder.parseFile(stream)
915
916
def parseString(string, namespaces=True):
    """Parse a document held in a string and return the resulting
    Document node.

    A true 'namespaces' selects the namespace-aware builder.
    """
    builder_class = ExpatBuilderNS if namespaces else ExpatBuilder
    return builder_class().parseString(string)
926
927
def parseFragment(file, context, namespaces=True):
    """Parse a fragment of a document, given the context from which it
    was originally extracted.

    'context' should be the parent of the node(s) which are in the
    fragment.  'file' may be either a file name or an open file
    object.  A true 'namespaces' selects the namespace-aware builder.
    """
    builder_class = FragmentBuilderNS if namespaces else FragmentBuilder
    builder = builder_class(context)

    if not isinstance(file, str):
        # Already an open file object: hand it straight to the builder.
        return builder.parseFile(file)

    # A file name: open it in binary mode for the duration of the parse.
    with open(file, 'rb') as stream:
        return builder.parseFile(stream)
946
947
def parseFragmentString(string, context, namespaces=True):
    """Parse a fragment of a document from a string, given the context
    from which it was originally extracted.

    'context' should be the parent of the node(s) which are in the
    fragment.  A true 'namespaces' selects the namespace-aware builder.
    """
    builder_class = FragmentBuilderNS if namespaces else FragmentBuilder
    return builder_class(context).parseString(string)
958
959
def makeBuilder(options):
    """Create a builder based on an Options object.

    A true 'options.namespaces' selects ExpatBuilderNS, otherwise
    ExpatBuilder is used; 'options' is passed on to the builder.
    """
    builder_class = ExpatBuilderNS if options.namespaces else ExpatBuilder
    return builder_class(options)
966