• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Facility to use the Expat parser to load a minidom instance
2from a string or file.
3
4This avoids all the overhead of SAX and pulldom to gain performance.
5"""
6
7# Warning!
8#
9# This module is tightly bound to the implementation details of the
10# minidom DOM and can't be used with other DOM implementations.  This
11# is due, in part, to a lack of appropriate methods in the DOM (there is
12# no way to create Entity and Notation nodes via the DOM Level 2
13# interface), and for performance.  The latter is the cause of some fairly
14# cryptic code.
15#
16# Performance hacks:
17#
18#   -  .character_data_handler() has an extra case in which continuing
19#      data is appended to an existing Text node; this can be a
20#      speedup since pyexpat can break up character data into multiple
21#      callbacks even though we set the buffer_text attribute on the
22#      parser.  This also gives us the advantage that we don't need a
23#      separate normalization pass.
24#
25#   -  Determining that a node exists is done using an identity comparison
26#      with None rather than a truth test; this avoids searching for and
27#      calling any methods on the node object if it exists.  (A rather
28#      nice speedup is achieved this way as well!)
29
30from xml.dom import xmlbuilder, minidom, Node
31from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
32from xml.parsers import expat
33from xml.dom.minidom import _append_child, _set_attribute_node
34from xml.dom.NodeFilter import NodeFilter
35
# Node-type codes used on the builder's hot paths, bound to module
# globals so the handlers do not need an attribute lookup on Node.
TEXT_NODE = Node.TEXT_NODE
CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
DOCUMENT_NODE = Node.DOCUMENT_NODE

# Verdicts a DOMBuilderFilter may hand back to the builder.
FILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
FILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
FILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
FILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT

# The minidom DOM implementation; used to create every new document.
theDOMImplementation = minidom.getDOMImplementation()

# Expat typename -> TypeInfo (one shared TypeInfo object per type name)
_typeinfo_map = {
    expat_type: minidom.TypeInfo(None, dom_type)
    for expat_type, dom_type in (
        ("CDATA", "cdata"),
        ("ENUM", "enumeration"),
        ("ENTITY", "entity"),
        ("ENTITIES", "entities"),
        ("ID", "id"),
        ("IDREF", "idref"),
        ("IDREFS", "idrefs"),
        ("NMTOKEN", "nmtoken"),
        ("NMTOKENS", "nmtokens"),
    )
}
59
class ElementInfo(object):
    """DTD-derived information about one element type: its content
    model and the attribute declarations recorded for it."""

    __slots__ = '_attr_info', '_model', 'tagName'

    def __init__(self, tagName, model=None):
        self._model = model
        self._attr_info = []
        self.tagName = tagName

    def __getstate__(self):
        return (self._attr_info, self._model, self.tagName)

    def __setstate__(self, state):
        (self._attr_info, self._model, self.tagName) = state

    def getAttributeType(self, aname):
        """Return the TypeInfo for the attribute declared as *aname*,
        or minidom._no_type when no such declaration was recorded."""
        for entry in self._attr_info:
            if entry[1] != aname:
                continue
            declared = entry[-2]
            # An enumerated type is reported as its "(a|b|...)" group.
            key = "ENUM" if declared[0] == "(" else declared
            return _typeinfo_map[key]
        return minidom._no_type

    def getAttributeTypeNS(self, namespaceURI, localName):
        """Namespace-qualified lookups always yield minidom._no_type."""
        return minidom._no_type

    def isElementContent(self):
        """Return True when a content model is known and it is neither
        ANY nor MIXED."""
        model = self._model
        if not model:
            return False
        return model[0] not in (expat.model.XML_CTYPE_ANY,
                                expat.model.XML_CTYPE_MIXED)

    def isEmpty(self):
        """Return True when a content model is known and it is EMPTY."""
        model = self._model
        if not model:
            return False
        return model[0] == expat.model.XML_CTYPE_EMPTY

    def isId(self, aname):
        """Return True when *aname* was declared with type ID."""
        return next((entry[-2] == "ID"
                     for entry in self._attr_info
                     if entry[1] == aname),
                    False)

    def isIdNS(self, euri, ename, auri, aname):
        # not sure this is meaningful
        key = (auri, aname)
        return self.isId(key)
110
111def _intern(builder, s):
112    return builder._intern_setdefault(s, s)
113
114def _parse_ns_name(builder, name):
115    assert ' ' in name
116    parts = name.split(' ')
117    intern = builder._intern_setdefault
118    if len(parts) == 3:
119        uri, localname, prefix = parts
120        prefix = intern(prefix, prefix)
121        qname = "%s:%s" % (prefix, localname)
122        qname = intern(qname, qname)
123        localname = intern(localname, localname)
124    elif len(parts) == 2:
125        uri, localname = parts
126        prefix = EMPTY_PREFIX
127        qname = localname = intern(localname, localname)
128    else:
129        raise ValueError("Unsupported syntax: spaces in URIs not supported: %r" % name)
130    return intern(uri, uri), localname, prefix, qname
131
132
class ExpatBuilder:
    """Document builder that uses Expat to build a minidom document
    instance.

    Behaviour is controlled by an xmlbuilder.Options object.  When the
    options carry a filter, nodes are offered to it as they are built
    and rejected nodes are removed from the tree again.  Each parse
    call returns the finished document and resets the builder.
    """

    def __init__(self, options=None):
        """Create a builder; default Options are used when none are
        given."""
        if options is None:
            options = xmlbuilder.Options()
        self._options = options
        if self._options.filter is not None:
            self._filter = FilterVisibilityController(self._options.filter)
        else:
            self._filter = None
            # This *really* doesn't do anything in this case, so
            # override it with something fast & minimal.
            self._finish_start_element = id
        self._parser = None
        self.reset()

    def createParser(self):
        """Create a new parser object."""
        return expat.ParserCreate()

    def getParser(self):
        """Return the parser object, creating a new one if needed."""
        if not self._parser:
            self._parser = self.createParser()
            # The parser's intern dictionary doubles as our string
            # intern table (see _intern() and _parse_ns_name()).
            self._intern_setdefault = self._parser.intern.setdefault
            self._parser.buffer_text = True
            self._parser.ordered_attributes = True
            self._parser.specified_attributes = True
            self.install(self._parser)
        return self._parser

    def reset(self):
        """Free all data structures used during DOM construction."""
        self.document = theDOMImplementation.createDocument(
            EMPTY_NAMESPACE, None, None)
        self.curNode = self.document
        self._elem_info = self.document._elem_info
        self._cdata = False

    def install(self, parser):
        """Install the callbacks needed to build the DOM into the parser."""
        # This creates circular references!
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        # The first element gets a one-shot handler which then swaps
        # in the regular start-element handler.
        parser.StartElementHandler = self.first_element_handler
        parser.EndElementHandler = self.end_element_handler
        parser.ProcessingInstructionHandler = self.pi_handler
        if self._options.entities:
            parser.EntityDeclHandler = self.entity_decl_handler
        parser.NotationDeclHandler = self.notation_decl_handler
        if self._options.comments:
            parser.CommentHandler = self.comment_handler
        if self._options.cdata_sections:
            parser.StartCdataSectionHandler = self.start_cdata_section_handler
            parser.EndCdataSectionHandler = self.end_cdata_section_handler
            parser.CharacterDataHandler = self.character_data_handler_cdata
        else:
            parser.CharacterDataHandler = self.character_data_handler
        parser.ExternalEntityRefHandler = self.external_entity_ref_handler
        parser.XmlDeclHandler = self.xml_decl_handler
        parser.ElementDeclHandler = self.element_decl_handler
        parser.AttlistDeclHandler = self.attlist_decl_handler

    def parseFile(self, file):
        """Parse a document from a file object, returning the document
        node.

        The file is read in 16 KiB chunks.  A ParseEscape raised while
        parsing ends the parse early; whatever was built so far is
        returned.
        """
        parser = self.getParser()
        first_buffer = True
        try:
            while buffer := file.read(16*1024):
                parser.Parse(buffer, False)
                # The internal subset is only looked for in the first
                # chunk, and only if the document element has already
                # been seen by then.
                if first_buffer and self.document.documentElement:
                    self._setup_subset(buffer)
                first_buffer = False
            parser.Parse(b"", True)
        except ParseEscape:
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def parseString(self, string):
        """Parse a document from a string, returning the document node.

        A ParseEscape raised while parsing ends the parse early;
        whatever was built so far is returned.
        """
        parser = self.getParser()
        try:
            parser.Parse(string, True)
            self._setup_subset(string)
        except ParseEscape:
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def _setup_subset(self, buffer):
        """Load the internal subset if there might be one."""
        if self.document.doctype:
            extractor = InternalSubsetExtractor()
            extractor.parseString(buffer)
            subset = extractor.getSubset()
            self.document.doctype.internalSubset = subset

    def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
                                   has_internal_subset):
        """Create the DocumentType node and attach it to the document.

        If the filter rejects the doctype it is detached again and the
        entity and notation handlers are switched off.  While an
        internal subset is being read, comments and processing
        instructions are ignored.
        """
        doctype = self.document.implementation.createDocumentType(
            doctypeName, publicId, systemId)
        doctype.ownerDocument = self.document
        _append_child(self.document, doctype)
        self.document.doctype = doctype
        if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
            self.document.doctype = None
            del self.document.childNodes[-1]
            doctype = None
            self._parser.EntityDeclHandler = None
            self._parser.NotationDeclHandler = None
        if has_internal_subset:
            if doctype is not None:
                # Give the doctype lists the declaration handlers can
                # append to.
                doctype.entities._seq = []
                doctype.notations._seq = []
            self._parser.CommentHandler = None
            self._parser.ProcessingInstructionHandler = None
            self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Restore the handlers that were disabled for the internal
        subset."""
        if self._options.comments:
            self._parser.CommentHandler = self.comment_handler
        self._parser.ProcessingInstructionHandler = self.pi_handler
        if not (self._elem_info or self._filter):
            # No element declarations and no filter: nothing to do at
            # the end of an element, so use a cheap no-op.
            self._finish_end_element = id

    def pi_handler(self, target, data):
        """Append a processing instruction node, unless the filter
        rejects it."""
        node = self.document.createProcessingInstruction(target, data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def character_data_handler_cdata(self, data):
        """Character data handler used when CDATA sections are kept.

        Inside a CDATA section the data goes into a CDATASection node
        (continuing the one just created, if any); elsewhere it is
        merged into a trailing Text node or put into a new one.
        """
        childNodes = self.curNode.childNodes
        if self._cdata:
            if (  self._cdata_continue
                  and childNodes[-1].nodeType == CDATA_SECTION_NODE):
                childNodes[-1].appendData(data)
                return
            node = self.document.createCDATASection(data)
            self._cdata_continue = True
        elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
            node = childNodes[-1]
            value = node.data + data
            node.data = value
            return
        else:
            node = minidom.Text()
            node.data = data
            node.ownerDocument = self.document
        _append_child(self.curNode, node)

    def character_data_handler(self, data):
        """Character data handler used when CDATA sections are not
        kept: merge into a trailing Text node or append a new one."""
        childNodes = self.curNode.childNodes
        if childNodes and childNodes[-1].nodeType == TEXT_NODE:
            node = childNodes[-1]
            node.data = node.data + data
            return
        node = minidom.Text()
        node.data = node.data + data
        node.ownerDocument = self.document
        _append_child(self.curNode, node)

    def entity_decl_handler(self, entityName, is_parameter_entity, value,
                            base, systemId, publicId, notationName):
        """Record a general entity declaration on the doctype, unless
        the filter rejects it."""
        if is_parameter_entity:
            # we don't care about parameter entities for the DOM
            return
        if not self._options.entities:
            return
        node = self.document._create_entity(entityName, publicId,
                                            systemId, notationName)
        if value is not None:
            # internal entity
            # node *should* be readonly, but we'll cheat
            child = self.document.createTextNode(value)
            node.childNodes.append(child)
        self.document.doctype.entities._seq.append(node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.entities._seq[-1]

    def notation_decl_handler(self, notationName, base, systemId, publicId):
        """Record a notation declaration on the doctype, unless the
        filter rejects it."""
        node = self.document._create_notation(notationName, publicId, systemId)
        self.document.doctype.notations._seq.append(node)
        # Remove the notation only when the filter rejects it, exactly
        # as entity_decl_handler() does for entities.
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.notations._seq[-1]

    def comment_handler(self, data):
        """Append a comment node, unless the filter rejects it."""
        node = self.document.createComment(data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def start_cdata_section_handler(self):
        """Note that a CDATA section has been entered."""
        self._cdata = True
        self._cdata_continue = False

    def end_cdata_section_handler(self):
        """Note that the CDATA section has been left."""
        self._cdata = False
        self._cdata_continue = False

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Ignore the external entity; the non-zero return value lets
        the parse continue."""
        return 1

    def first_element_handler(self, name, attributes):
        """One-shot start handler for the document element; installs
        the regular handler and delegates to it."""
        if self._filter is None and not self._elem_info:
            # Nothing to do at the end of an element: use a no-op.
            self._finish_end_element = id
        self.getParser().StartElementHandler = self.start_element_handler
        self.start_element_handler(name, attributes)

    def start_element_handler(self, name, attributes):
        """Create an element with its attributes and make it the
        current node.

        *attributes* is a flat list of alternating names and values.
        """
        node = self.document.createElement(name)
        _append_child(self.curNode, node)
        self.curNode = node

        if attributes:
            for i in range(0, len(attributes), 2):
                a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
                                 None, EMPTY_PREFIX)
                value = attributes[i+1]
                a.value = value
                a.ownerDocument = self.document
                _set_attribute_node(node, a)

        if node is not self.document.documentElement:
            self._finish_start_element(node)

    def _finish_start_element(self, node):
        """Offer a freshly started element to the filter.

        On FILTER_REJECT or FILTER_SKIP a Rejecter or Skipper takes
        over the element handlers and the element is removed from the
        tree.  The document element is never filtered.
        """
        if self._filter:
            # To be general, we'd have to call isSameNode(), but this
            # is sufficient for minidom:
            if node is self.document.documentElement:
                return
            filt = self._filter.startContainer(node)
            if filt == FILTER_REJECT:
                # ignore this node & all descendants
                Rejecter(self)
            elif filt == FILTER_SKIP:
                # ignore this node, but make its children become
                # children of the parent node
                Skipper(self)
            else:
                return
            self.curNode = node.parentNode
            node.parentNode.removeChild(node)
            node.unlink()

    # If this ever changes, Namespaces.end_element_handler() needs to
    # be changed to match.
    #
    def end_element_handler(self, name):
        """Pop the current element and run the end-of-element checks."""
        curNode = self.curNode
        self.curNode = curNode.parentNode
        self._finish_end_element(curNode)

    def _finish_end_element(self, curNode):
        """Strip ignorable whitespace from a finished element and
        offer the element to the filter (the document element is never
        filtered)."""
        info = self._elem_info.get(curNode.tagName)
        if info:
            self._handle_white_text_nodes(curNode, info)
        if self._filter:
            if curNode is self.document.documentElement:
                return
            if self._filter.acceptNode(curNode) == FILTER_REJECT:
                self.curNode.removeChild(curNode)
                curNode.unlink()

    def _handle_white_text_nodes(self, node, info):
        """Remove whitespace-only text children of *node* when its
        declared content model is element content and the options do
        not ask for such whitespace to be kept."""
        if (self._options.whitespace_in_element_content
            or not info.isElementContent()):
            return

        # We have element type information and should remove ignorable
        # whitespace; identify for text nodes which contain only
        # whitespace.
        L = []
        for child in node.childNodes:
            if child.nodeType == TEXT_NODE and not child.data.strip():
                L.append(child)

        # Remove ignorable whitespace from the tree.
        for child in L:
            node.removeChild(child)

    def element_decl_handler(self, name, model):
        """Record the content model declared for element type *name*."""
        info = self._elem_info.get(name)
        if info is None:
            self._elem_info[name] = ElementInfo(name, model)
        else:
            # The entry was created by an earlier ATTLIST declaration.
            assert info._model is None
            info._model = model

    def attlist_decl_handler(self, elem, name, type, default, required):
        """Record one attribute declaration for element type *elem*."""
        info = self._elem_info.get(elem)
        if info is None:
            info = ElementInfo(elem)
            self._elem_info[elem] = info
        # ElementInfo reads the name at index 1 and the type at
        # index -2 of this record.
        info._attr_info.append(
            [None, name, None, None, default, 0, type, required])

    def xml_decl_handler(self, version, encoding, standalone):
        """Copy the XML declaration's values onto the document."""
        self.document.version = version
        self.document.encoding = encoding
        # This is still a little ugly, thanks to the pyexpat API. ;-(
        # A negative value leaves document.standalone untouched.
        if standalone >= 0:
            if standalone:
                self.document.standalone = True
            else:
                self.document.standalone = False
447
448
# The values a filter callback may legally return.  FILTER_INTERRUPT is
# not included, since that's checked separately where allowed.
_ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)
452
class FilterVisibilityController(object):
    """Adapter around a DOMBuilderFilter that honours the filter's
    whatToShow mask: node types the filter did not ask to see are
    accepted without consulting it."""

    __slots__ = 'filter',

    # Node type -> the whatToShow bit that selects it.
    _nodetype_mask = {
        Node.ELEMENT_NODE:                NodeFilter.SHOW_ELEMENT,
        Node.ATTRIBUTE_NODE:              NodeFilter.SHOW_ATTRIBUTE,
        Node.TEXT_NODE:                   NodeFilter.SHOW_TEXT,
        Node.CDATA_SECTION_NODE:          NodeFilter.SHOW_CDATA_SECTION,
        Node.ENTITY_REFERENCE_NODE:       NodeFilter.SHOW_ENTITY_REFERENCE,
        Node.ENTITY_NODE:                 NodeFilter.SHOW_ENTITY,
        Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
        Node.COMMENT_NODE:                NodeFilter.SHOW_COMMENT,
        Node.DOCUMENT_NODE:               NodeFilter.SHOW_DOCUMENT,
        Node.DOCUMENT_TYPE_NODE:          NodeFilter.SHOW_DOCUMENT_TYPE,
        Node.DOCUMENT_FRAGMENT_NODE:      NodeFilter.SHOW_DOCUMENT_FRAGMENT,
        Node.NOTATION_NODE:               NodeFilter.SHOW_NOTATION,
        }

    def __init__(self, filter):
        self.filter = filter

    def startContainer(self, node):
        """Ask the filter about a container that has just been opened.

        Raises ParseEscape on FILTER_INTERRUPT and ValueError on any
        value outside the allowed set.
        """
        mask = self._nodetype_mask[node.nodeType]
        if not self.filter.whatToShow & mask:
            return FILTER_ACCEPT
        verdict = self.filter.startContainer(node)
        if verdict == FILTER_INTERRUPT:
            raise ParseEscape
        if verdict not in _ALLOWED_FILTER_RETURNS:
            raise ValueError(
                  "startContainer() returned illegal value: " + repr(verdict))
        return verdict

    def acceptNode(self, node):
        """Ask the filter about a completed node.

        FILTER_SKIP is handled here by moving the node's children to
        its parent and reporting FILTER_REJECT, so the caller removes
        only the node itself.  Raises ParseEscape on FILTER_INTERRUPT
        and ValueError on any value outside the allowed set.
        """
        mask = self._nodetype_mask[node.nodeType]
        if not self.filter.whatToShow & mask:
            return FILTER_ACCEPT
        verdict = self.filter.acceptNode(node)
        if verdict == FILTER_INTERRUPT:
            raise ParseEscape
        if verdict == FILTER_SKIP:
            # Iterate over a copy: appendChild() detaches each child
            # from the node being skipped.
            new_parent = node.parentNode
            for orphan in node.childNodes[:]:
                new_parent.appendChild(orphan)
            # node is handled by the caller
            return FILTER_REJECT
        if verdict not in _ALLOWED_FILTER_RETURNS:
            raise ValueError(
                  "acceptNode() returned illegal value: " + repr(verdict))
        return verdict
509
510
class FilterCrutch(object):
    """Base for helpers that take over a builder's element handlers.

    The handlers in place at construction time are remembered and
    replaced by this object's start_element_handler and
    end_element_handler, which subclasses must provide.
    """

    __slots__ = '_builder', '_level', '_old_start', '_old_end'

    def __init__(self, builder):
        parser = builder._parser
        self._builder = builder
        # Nesting depth below the element that triggered the crutch.
        self._level = 0
        self._old_start, self._old_end = (
            parser.StartElementHandler, parser.EndElementHandler)
        parser.StartElementHandler = self.start_element_handler
        parser.EndElementHandler = self.end_element_handler
522
class Rejecter(FilterCrutch):
    """Crutch that discards everything inside a rejected element."""

    __slots__ = ()

    def __init__(self, builder):
        FilterCrutch.__init__(self, builder)
        target = builder._parser
        # Content handlers are switched off for the whole subtree.
        silenced = (
            "ProcessingInstructionHandler",
            "CommentHandler",
            "CharacterDataHandler",
            "StartCdataSectionHandler",
            "EndCdataSectionHandler",
            "ExternalEntityRefHandler",
        )
        for handler_name in silenced:
            setattr(target, handler_name, None)

    def start_element_handler(self, *args):
        self._level += 1

    def end_element_handler(self, *args):
        if self._level != 0:
            self._level -= 1
            return
        # Leaving the rejected element itself: restore the old handlers.
        parser = self._builder._parser
        self._builder.install(parser)
        parser.StartElementHandler = self._old_start
        parser.EndElementHandler = self._old_end
550
class Skipper(FilterCrutch):
    """Crutch for a skipped element: its descendants are still built
    by the original handlers, only the element itself is left out."""

    __slots__ = ()

    def start_element_handler(self, *args):
        before = self._builder.curNode
        self._old_start(*args)
        # Only count the element if the builder actually descended
        # into it.
        if self._builder.curNode is not before:
            self._level += 1

    def end_element_handler(self, *args):
        if self._level != 0:
            self._level -= 1
            self._old_end(*args)
            return
        # We're popping back out of the node we're skipping, so we
        # shouldn't need to do anything but reset the handlers.
        parser = self._builder._parser
        parser.StartElementHandler = self._old_start
        parser.EndElementHandler = self._old_end
        self._builder = None
570
571
# Framework document used by the fragment builder.
# Takes a string for the doctype, subset string, and namespace attrs string.

# System identifier of the entity that stands in for the fragment text;
# FragmentBuilder.external_entity_ref_handler() recognizes it.
_FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
    "http://xml.python.org/entities/fragment-builder/internal"

# Only the system identifier is substituted here.  Each doubled "%%s"
# comes out of this first formatting pass as a plain "%s" placeholder,
# which FragmentBuilder.parseString() fills in per parse.
_FRAGMENT_BUILDER_TEMPLATE = (
    '''\
<!DOCTYPE wrapper
  %%s [
  <!ENTITY fragment-builder-internal
    SYSTEM "%s">
%%s
]>
<wrapper %%s
>&fragment-builder-internal;</wrapper>'''
    % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)
589
590
class FragmentBuilder(ExpatBuilder):
    """Builder which constructs document fragments given XML source
    text and a context node.

    The context node is expected to provide information about the
    namespace declarations which are in scope at the start of the
    fragment.

    The source text is not parsed directly.  A small wrapper document
    is generated whose only content is a reference to an external
    entity; when Expat asks for that entity, the real source text is
    parsed into a DocumentFragment owned by the original document.
    """

    def __init__(self, context, options=None):
        """Remember *context* and the document it belongs to."""
        if context.nodeType == DOCUMENT_NODE:
            # A document is its own "original document".
            self.originalDocument = context
        else:
            self.originalDocument = context.ownerDocument
        self.context = context
        ExpatBuilder.__init__(self, options)

    def reset(self):
        """Reset the builder state and forget any built fragment."""
        ExpatBuilder.reset(self)
        self.fragment = None

    def parseFile(self, file):
        """Parse a document fragment from a file object, returning the
        fragment node."""
        return self.parseString(file.read())

    def parseString(self, string):
        """Parse a document fragment from a string, returning the
        fragment node.

        The builder is reset afterwards whether or not parsing
        succeeded; parse errors propagate to the caller.
        """
        self._source = string
        parser = self.getParser()
        doctype = self.originalDocument.doctype
        ident = ""
        if doctype:
            # Prefer the subset text as written; otherwise rebuild the
            # declarations from the DocumentType node.
            subset = doctype.internalSubset or self._getDeclarations()
            if doctype.publicId:
                ident = ('PUBLIC "%s" "%s"'
                         % (doctype.publicId, doctype.systemId))
            elif doctype.systemId:
                ident = 'SYSTEM "%s"' % doctype.systemId
        else:
            subset = ""
        nsattrs = self._getNSattrs() # get ns decls from node's ancestors
        document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
        try:
            parser.Parse(document, True)
            # Grab the result before reset() clears it.
            fragment = self.fragment
        finally:
            self.reset()
        # NOTE(review): unlike ExpatBuilder.parseString(), the parser is
        # not discarded here (the original code kept
        # "self._parser = None" commented out) -- confirm whether a
        # builder instance is meant to be reusable.
        return fragment

    def _getDeclarations(self):
        """Re-create the internal subset from the DocumentType node.

        This is only needed if we don't already have the
        internalSubset as a string.
        """
        # Use the original document: when the context node is the
        # document itself, its ownerDocument is None.
        doctype = self.originalDocument.doctype
        if not doctype:
            return ""
        decls = []
        for i in range(doctype.notations.length):
            notation = doctype.notations.item(i)
            if notation.publicId:
                decls.append(
                    '<!NOTATION %s PUBLIC "%s"\n             "%s">'
                    % (notation.nodeName, notation.publicId,
                       notation.systemId))
            else:
                decls.append('<!NOTATION %s SYSTEM "%s">'
                             % (notation.nodeName, notation.systemId))
        for i in range(doctype.entities.length):
            entity = doctype.entities.item(i)
            decl = "<!ENTITY %s" % entity.nodeName
            if entity.publicId:
                decl = '%s PUBLIC "%s"\n             "%s"' \
                       % (decl, entity.publicId, entity.systemId)
            elif entity.systemId:
                decl = '%s SYSTEM "%s"' % (decl, entity.systemId)
            else:
                # Internal entity: its replacement text is stored as
                # a single text child.
                decl = '%s "%s"' % (decl, entity.firstChild.data)
            if entity.notationName:
                # The XML keyword that introduces the notation of an
                # unparsed entity is NDATA.
                decl = "%s NDATA %s" % (decl, entity.notationName)
            decls.append(decl + ">")
        return "\n  ".join(decls)

    def _getNSattrs(self):
        """Return the namespace attribute text for the wrapper
        element; this builder has none."""
        return ""

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Parse the fragment source when the wrapper document's
        placeholder entity is referenced; other entities are handled
        by ExpatBuilder."""
        if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
            # this entref is the one that we made to put the subtree
            # in; all of our given input is parsed in here.
            old_document = self.document
            old_cur_node = self.curNode
            parser = self._parser.ExternalEntityParserCreate(context)
            # put the real document back, parse into the fragment to return
            self.document = self.originalDocument
            self.fragment = self.document.createDocumentFragment()
            self.curNode = self.fragment
            try:
                parser.Parse(self._source, True)
            finally:
                self.curNode = old_cur_node
                self.document = old_document
                self._source = None
            return -1
        else:
            return ExpatBuilder.external_entity_ref_handler(
                self, context, base, systemId, publicId)
706
707
class Namespaces:
    """Mix-in class for builders; adds support for namespaces."""

    def _initNamespaces(self):
        # Pending (prefix, uri) declarations.  They are turned into
        # xmlns attributes on the next element that is started.
        self._ns_ordered_prefixes = []

    def createParser(self):
        """Create a new namespace-handling parser."""
        ns_parser = expat.ParserCreate(namespace_separator=" ")
        ns_parser.namespace_prefixes = True
        return ns_parser

    def install(self, parser):
        """Install the regular handlers and, if the options ask for
        namespace declarations, the namespace declaration handler."""
        ExpatBuilder.install(self, parser)
        if not self._options.namespace_declarations:
            return
        parser.StartNamespaceDeclHandler = self.start_namespace_decl_handler

    def start_namespace_decl_handler(self, prefix, uri):
        """Remember this declaration for the next element."""
        declaration = (prefix, uri)
        self._ns_ordered_prefixes.append(declaration)

    def start_element_handler(self, name, attributes):
        """Create a namespace-aware element, give it the pending xmlns
        attributes and its own attributes, and make it current.

        *attributes* is a flat list of alternating names and values.
        """
        if ' ' not in name:
            uri, localname, prefix, qname = (
                EMPTY_NAMESPACE, None, EMPTY_PREFIX, name)
        else:
            uri, localname, prefix, qname = _parse_ns_name(self, name)
        element = minidom.Element(qname, uri, prefix, localname)
        element.ownerDocument = self.document
        _append_child(self.curNode, element)
        self.curNode = element

        pending = self._ns_ordered_prefixes
        if pending:
            for ns_prefix, ns_uri in pending:
                if ns_prefix:
                    decl = minidom.Attr(_intern(self, 'xmlns:' + ns_prefix),
                                        XMLNS_NAMESPACE, ns_prefix, "xmlns")
                else:
                    # declaration of the default namespace
                    decl = minidom.Attr("xmlns", XMLNS_NAMESPACE,
                                        "xmlns", EMPTY_PREFIX)
                decl.value = ns_uri
                decl.ownerDocument = self.document
                _set_attribute_node(element, decl)
            # Empty the list in place; the same list object is reused.
            del pending[:]

        if not attributes:
            return
        element._ensure_attributes()
        by_qname = element._attrs
        by_ns_name = element._attrsNS
        for index in range(0, len(attributes), 2):
            aname = attributes[index]
            value = attributes[index + 1]
            if ' ' not in aname:
                attr = minidom.Attr(aname, EMPTY_NAMESPACE,
                                    aname, EMPTY_PREFIX)
                by_qname[aname] = attr
                by_ns_name[(EMPTY_NAMESPACE, aname)] = attr
            else:
                a_uri, a_local, a_prefix, a_qname = _parse_ns_name(self, aname)
                attr = minidom.Attr(a_qname, a_uri, a_local, a_prefix)
                by_qname[a_qname] = attr
                by_ns_name[(a_uri, a_local)] = attr
            attr.ownerDocument = self.document
            # The value is assigned while the attribute has no owner
            # element yet.
            attr.value = value
            attr.ownerElement = element

    if __debug__:
        # This only adds some asserts to the original
        # end_element_handler(), so we only define this when -O is not
        # used.  If changing one, be sure to check the other to see if
        # it needs to be changed as well.
        #
        def end_element_handler(self, name):
            finished = self.curNode
            if ' ' not in name:
                assert finished.nodeName == name, \
                       "element stack messed up - bad nodeName"
                assert finished.namespaceURI == EMPTY_NAMESPACE, \
                       "element stack messed up - bad namespaceURI"
            else:
                uri, localname, prefix, qname = _parse_ns_name(self, name)
                assert (finished.namespaceURI == uri
                        and finished.localName == localname
                        and finished.prefix == prefix), \
                        "element stack messed up! (namespace)"
            self.curNode = finished.parentNode
            self._finish_end_element(finished)
801
802
class ExpatBuilderNS(Namespaces, ExpatBuilder):
    """Namespace-aware variant of the document builder."""

    def reset(self):
        """Reset the ExpatBuilder state, then call _initNamespaces()."""
        # ExpatBuilder.reset() is invoked explicitly rather than through
        # the MRO, so it runs first regardless of what Namespaces defines.
        ExpatBuilder.reset(self)
        self._initNamespaces()
809
810
class FragmentBuilderNS(Namespaces, FragmentBuilder):
    """Fragment builder that supports namespaces."""

    def reset(self):
        """Reset the FragmentBuilder state, then call _initNamespaces()."""
        FragmentBuilder.reset(self)
        self._initNamespaces()

    def _getNSattrs(self):
        """Return string of namespace attributes from this element and
        ancestors.

        The context node and its parentNode chain are walked; for every
        node that has a _ns_prefix_uri mapping, each prefix not already
        seen on a nearer node contributes one ``xmlns`` or
        ``xmlns:prefix`` declaration.  The result is empty when nothing
        is found; otherwise it starts with a space and each further
        declaration goes on its own line, indented by four spaces.

        URIs are escaped so that '&', '<', '>' and "'" cannot break the
        single-quoted attribute syntax of the returned text.
        """
        # XXX This needs to be re-written to walk the ancestors of the
        # context to build up the namespace information from
        # declarations, elements, and attributes found in context.
        # Otherwise we have to store a bunch more data on the DOM
        # (though that *might* be more reliable -- not clear).
        from xml.sax.saxutils import escape

        seen = set()
        decls = []
        context = self.context
        while context:
            if hasattr(context, '_ns_prefix_uri'):
                for prefix, uri in context._ns_prefix_uri.items():
                    # Only the nearest declaration of a prefix counts.
                    if prefix in seen:
                        continue
                    seen.add(prefix)
                    declname = "xmlns:" + prefix if prefix else "xmlns"
                    # str() mirrors the '%s' formatting used previously;
                    # the apostrophe is escaped because the value is
                    # wrapped in single quotes.
                    value = escape(str(uri), {"'": "&apos;"})
                    decls.append("%s='%s'" % (declname, value))
            context = context.parentNode
        if not decls:
            return ""
        return " " + "\n    ".join(decls)
846
847
class ParseEscape(Exception):
    """Raised by InternalSubsetExtractor handlers to stop parsing early."""
851
class InternalSubsetExtractor(ExpatBuilder):
    """XML processor that extracts the internal document type subset."""

    # Stays None unless a doctype declaration with an internal subset is
    # seen; holds a list of text pieces while the subset is collected and
    # the joined string once the doctype declaration has ended.
    subset = None

    def getSubset(self):
        """Return the collected internal subset (None if none was found)."""
        return self.subset

    def parseFile(self, file):
        """Run ExpatBuilder.parseFile(), swallowing ParseEscape.

        Nothing is returned; the result is available via getSubset().
        """
        try:
            ExpatBuilder.parseFile(self, file)
        except ParseEscape:
            # Raised on purpose by the handlers below to end parsing.
            return

    def parseString(self, string):
        """Run ExpatBuilder.parseString(), swallowing ParseEscape.

        Nothing is returned; the result is available via getSubset().
        """
        try:
            ExpatBuilder.parseString(self, string)
        except ParseEscape:
            # Raised on purpose by the handlers below to end parsing.
            return

    def install(self, parser):
        """Attach only the doctype-start and element-start handlers."""
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.start_element_handler

    def start_doctype_decl_handler(self, name, publicId, systemId,
                                   has_internal_subset):
        """Begin collecting the internal subset, or stop if there is none."""
        if not has_internal_subset:
            raise ParseEscape()
        parser = self.getParser()
        pieces = []
        self.subset = pieces
        # Everything reaching the default handler is appended to the list
        # until the end of the doctype declaration is reported.
        parser.DefaultHandler = pieces.append
        parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Join the collected pieces, normalize newlines, stop parsing."""
        text = ''.join(self.subset)
        # Convert '\r\n' first so it does not turn into two newlines.
        self.subset = text.replace('\r\n', '\n').replace('\r', '\n')
        raise ParseEscape()

    def start_element_handler(self, name, attrs):
        """Stop parsing as soon as an element start is reported."""
        raise ParseEscape()
894
895
def parse(file, namespaces=True):
    """Parse a document and return the resulting Document node.

    'file' may be either a file name or an open file object.  A file
    name is opened in binary mode and closed again after parsing.  A
    true 'namespaces' selects ExpatBuilderNS, otherwise ExpatBuilder is
    used.
    """
    builder = ExpatBuilderNS() if namespaces else ExpatBuilder()

    if not isinstance(file, str):
        # Anything that is not a str is treated as an open file object.
        return builder.parseFile(file)
    with open(file, 'rb') as stream:
        return builder.parseFile(stream)
912
913
def parseString(string, namespaces=True):
    """Parse a document held in a string and return the resulting
    Document node.

    A true 'namespaces' selects ExpatBuilderNS, otherwise ExpatBuilder
    is used.
    """
    builder = ExpatBuilderNS() if namespaces else ExpatBuilder()
    return builder.parseString(string)
923
924
def parseFragment(file, context, namespaces=True):
    """Parse a document fragment, given the context from which it was
    originally extracted.  context should be the parent of the node(s)
    which are in the fragment.

    'file' may be either a file name or an open file object.  A file
    name is opened in binary mode and closed again after parsing.  A
    true 'namespaces' selects FragmentBuilderNS, otherwise
    FragmentBuilder is used.
    """
    builder_class = FragmentBuilderNS if namespaces else FragmentBuilder
    builder = builder_class(context)

    if not isinstance(file, str):
        # Anything that is not a str is treated as an open file object.
        return builder.parseFile(file)
    with open(file, 'rb') as stream:
        return builder.parseFile(stream)
943
944
def parseFragmentString(string, context, namespaces=True):
    """Parse a document fragment held in a string, given the context
    from which it was originally extracted.  context should be the
    parent of the node(s) which are in the fragment.

    A true 'namespaces' selects FragmentBuilderNS, otherwise
    FragmentBuilder is used.
    """
    builder_class = FragmentBuilderNS if namespaces else FragmentBuilder
    return builder_class(context).parseString(string)
955
956
def makeBuilder(options):
    """Create a builder based on an Options object.

    options.namespaces decides between ExpatBuilderNS (true) and
    ExpatBuilder (false); the options object is passed on to the
    selected class.
    """
    builder_class = ExpatBuilderNS if options.namespaces else ExpatBuilder
    return builder_class(options)
963