• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Facility to use the Expat parser to load a minidom instance
2from a string or file.
3
4This avoids all the overhead of SAX and pulldom to gain performance.
5"""
6
7# Warning!
8#
9# This module is tightly bound to the implementation details of the
10# minidom DOM and can't be used with other DOM implementations.  This
11# is due, in part, to a lack of appropriate methods in the DOM (there is
12# no way to create Entity and Notation nodes via the DOM Level 2
13# interface), and for performance.  The latter is the cause of some fairly
14# cryptic code.
15#
16# Performance hacks:
17#
18#   -  .character_data_handler() has an extra case in which continuing
19#      data is appended to an existing Text node; this can be a
20#      speedup since pyexpat can break up character data into multiple
21#      callbacks even though we set the buffer_text attribute on the
22#      parser.  This also gives us the advantage that we don't need a
23#      separate normalization pass.
24#
25#   -  Determining that a node exists is done using an identity comparison
26#      with None rather than a truth test; this avoids searching for and
27#      calling any methods on the node object if it exists.  (A rather
28#      nice speedup is achieved this way as well!)
29
30from xml.dom import xmlbuilder, minidom, Node
31from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
32from xml.parsers import expat
33from xml.dom.minidom import _append_child, _set_attribute_node
34from xml.dom.NodeFilter import NodeFilter
35
36from xml.dom.minicompat import *
37
# Module-level aliases for the node-type constants the handlers below
# compare against.
TEXT_NODE = Node.TEXT_NODE
CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
DOCUMENT_NODE = Node.DOCUMENT_NODE

# Module-level aliases for the DOMBuilderFilter return codes.
FILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
FILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
FILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
FILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT

# The DOM implementation used by ExpatBuilder.reset() to create each new
# document.
theDOMImplementation = minidom.getDOMImplementation()

# Expat typename -> TypeInfo
# ("ENUM" is looked up by ElementInfo.getAttributeType() for declared
# types that start with "(".)
_typeinfo_map = {
    "CDATA":    minidom.TypeInfo(None, "cdata"),
    "ENUM":     minidom.TypeInfo(None, "enumeration"),
    "ENTITY":   minidom.TypeInfo(None, "entity"),
    "ENTITIES": minidom.TypeInfo(None, "entities"),
    "ID":       minidom.TypeInfo(None, "id"),
    "IDREF":    minidom.TypeInfo(None, "idref"),
    "IDREFS":   minidom.TypeInfo(None, "idrefs"),
    "NMTOKEN":  minidom.TypeInfo(None, "nmtoken"),
    "NMTOKENS": minidom.TypeInfo(None, "nmtokens"),
    }
61
class ElementInfo(object):
    """Per-element-type information gathered from DTD declarations.

    Attributes:
      tagName    -- name of the element type this record describes.
      _model     -- content model as delivered to element_decl_handler(),
                    or None when no <!ELEMENT ...> declaration was seen.
      _attr_info -- list of attribute records appended by
                    attlist_decl_handler(); index 1 holds the attribute
                    name and index -2 the declared type string.
    """

    __slots__ = '_attr_info', '_model', 'tagName'

    def __init__(self, tagName, model=None):
        self.tagName = tagName
        self._attr_info = []
        self._model = model

    def __getstate__(self):
        # Slots have no __dict__, so the state is spelled out explicitly.
        return self._attr_info, self._model, self.tagName

    def __setstate__(self, state):
        attr_info, model, tag_name = state
        self._attr_info = attr_info
        self._model = model
        self.tagName = tag_name

    def getAttributeType(self, aname):
        """Return the TypeInfo for attribute aname, or minidom._no_type
        when no declaration for it has been recorded."""
        for record in self._attr_info:
            if record[1] != aname:
                continue
            declared = record[-2]
            # Enumerated types are reported as the literal "(a|b|...)".
            if declared[0] == "(":
                return _typeinfo_map["ENUM"]
            return _typeinfo_map[declared]
        return minidom._no_type

    def getAttributeTypeNS(self, namespaceURI, localName):
        """Namespace-aware lookup; always answers minidom._no_type."""
        return minidom._no_type

    def isElementContent(self):
        """Return True when a content model is known and it is neither
        ANY nor MIXED."""
        if not self._model:
            return False
        content_type = self._model[0]
        non_element_types = (expat.model.XML_CTYPE_ANY,
                             expat.model.XML_CTYPE_MIXED)
        return content_type not in non_element_types

    def isEmpty(self):
        """Return True when a content model is known and it is EMPTY."""
        if not self._model:
            return False
        return self._model[0] == expat.model.XML_CTYPE_EMPTY

    def isId(self, aname):
        """Return True when attribute aname was declared with type ID."""
        for record in self._attr_info:
            if record[1] == aname:
                return record[-2] == "ID"
        return False

    def isIdNS(self, euri, ename, auri, aname):
        # not sure this is meaningful
        key = (auri, aname)
        return self.isId(key)
112
113def _intern(builder, s):
114    return builder._intern_setdefault(s, s)
115
116def _parse_ns_name(builder, name):
117    assert ' ' in name
118    parts = name.split(' ')
119    intern = builder._intern_setdefault
120    if len(parts) == 3:
121        uri, localname, prefix = parts
122        prefix = intern(prefix, prefix)
123        qname = "%s:%s" % (prefix, localname)
124        qname = intern(qname, qname)
125        localname = intern(localname, localname)
126    else:
127        uri, localname = parts
128        prefix = EMPTY_PREFIX
129        qname = localname = intern(localname, localname)
130    return intern(uri, uri), localname, prefix, qname
131
132
class ExpatBuilder:
    """Document builder that uses Expat to build a minidom document
    instance."""

    def __init__(self, options=None):
        """Create a builder.

        options is an xmlbuilder.Options instance controlling which
        handlers are installed; a default instance is created when it
        is omitted.
        """
        if options is None:
            options = xmlbuilder.Options()
        self._options = options
        if self._options.filter is not None:
            self._filter = FilterVisibilityController(self._options.filter)
        else:
            self._filter = None
            # This *really* doesn't do anything in this case, so
            # override it with something fast & minimal.
            self._finish_start_element = id
        self._parser = None
        self.reset()

    def createParser(self):
        """Create a new parser object."""
        return expat.ParserCreate()

    def getParser(self):
        """Return the parser object, creating a new one if needed."""
        if not self._parser:
            self._parser = self.createParser()
            # The parser's intern table doubles as our string cache.
            self._intern_setdefault = self._parser.intern.setdefault
            self._parser.buffer_text = True
            # Attributes arrive as a flat [name, value, name, value, ...]
            # list; start_element_handler() walks it two items at a time.
            self._parser.ordered_attributes = True
            self._parser.specified_attributes = True
            self.install(self._parser)
        return self._parser

    def reset(self):
        """Free all data structures used during DOM construction."""
        self.document = theDOMImplementation.createDocument(
            EMPTY_NAMESPACE, None, None)
        self.curNode = self.document
        self._elem_info = self.document._elem_info
        self._cdata = False

    def install(self, parser):
        """Install the callbacks needed to build the DOM into the parser."""
        # This creates circular references!
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.first_element_handler
        parser.EndElementHandler = self.end_element_handler
        parser.ProcessingInstructionHandler = self.pi_handler
        if self._options.entities:
            parser.EntityDeclHandler = self.entity_decl_handler
        parser.NotationDeclHandler = self.notation_decl_handler
        if self._options.comments:
            parser.CommentHandler = self.comment_handler
        if self._options.cdata_sections:
            parser.StartCdataSectionHandler = self.start_cdata_section_handler
            parser.EndCdataSectionHandler = self.end_cdata_section_handler
            parser.CharacterDataHandler = self.character_data_handler_cdata
        else:
            parser.CharacterDataHandler = self.character_data_handler
        parser.ExternalEntityRefHandler = self.external_entity_ref_handler
        parser.XmlDeclHandler = self.xml_decl_handler
        parser.ElementDeclHandler = self.element_decl_handler
        parser.AttlistDeclHandler = self.attlist_decl_handler

    def parseFile(self, file):
        """Parse a document from a file object, returning the document
        node."""
        parser = self.getParser()
        first_buffer = True
        try:
            while True:
                buffer = file.read(16*1024)
                if not buffer:
                    break
                parser.Parse(buffer, 0)
                # The internal subset is only looked for in the first
                # chunk, and only once the document element has started.
                if first_buffer and self.document.documentElement:
                    self._setup_subset(buffer)
                first_buffer = False
            parser.Parse("", True)
        except ParseEscape:
            # Raised by the filter wrapper to stop parsing early; what
            # has been built so far is returned.
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def parseString(self, string):
        """Parse a document from a string, returning the document node."""
        parser = self.getParser()
        try:
            parser.Parse(string, True)
            self._setup_subset(string)
        except ParseEscape:
            # Parsing was interrupted by the filter; return the partial
            # document.
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def _setup_subset(self, buffer):
        """Load the internal subset if there might be one."""
        if self.document.doctype:
            extractor = InternalSubsetExtractor()
            extractor.parseString(buffer)
            subset = extractor.getSubset()
            self.document.doctype.internalSubset = subset

    def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
                                   has_internal_subset):
        """Expat callback: create the DocumentType node for the document.

        Note the argument order: Expat passes systemId before publicId,
        while createDocumentType() takes publicId first.
        """
        doctype = self.document.implementation.createDocumentType(
            doctypeName, publicId, systemId)
        doctype.ownerDocument = self.document
        _append_child(self.document, doctype)
        self.document.doctype = doctype
        if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
            # Undo the insertion and stop collecting entity/notation
            # declarations, since there is no doctype to attach them to.
            self.document.doctype = None
            del self.document.childNodes[-1]
            doctype = None
            self._parser.EntityDeclHandler = None
            self._parser.NotationDeclHandler = None
        if has_internal_subset:
            if doctype is not None:
                # Swap in mutable lists so the declaration handlers can
                # append to them.
                doctype.entities._seq = []
                doctype.notations._seq = []
            # Comments and PIs inside the internal subset are not added
            # to the tree; end_doctype_decl_handler() re-enables them.
            self._parser.CommentHandler = None
            self._parser.ProcessingInstructionHandler = None
            self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Expat callback: restore the handlers that were disabled for
        the duration of the internal subset."""
        if self._options.comments:
            self._parser.CommentHandler = self.comment_handler
        self._parser.ProcessingInstructionHandler = self.pi_handler
        if not (self._elem_info or self._filter):
            # Nothing for _finish_end_element() to do; use a cheap no-op.
            self._finish_end_element = id

    def pi_handler(self, target, data):
        """Expat callback: append a processing instruction to the
        current node, unless the filter rejects it."""
        node = self.document.createProcessingInstruction(target, data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def character_data_handler_cdata(self, data):
        """Expat callback for character data when CDATA sections are
        kept as separate nodes.

        Data is appended to the trailing CDATA/Text node when one is
        being continued; otherwise a new node is created.
        """
        childNodes = self.curNode.childNodes
        if self._cdata:
            if (  self._cdata_continue
                  and childNodes[-1].nodeType == CDATA_SECTION_NODE):
                childNodes[-1].appendData(data)
                return
            node = self.document.createCDATASection(data)
            self._cdata_continue = True
        elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
            node = childNodes[-1]
            value = node.data + data
            # Write through __dict__ directly (performance hack; relies
            # on minidom's instance layout).
            d = node.__dict__
            d['data'] = d['nodeValue'] = value
            return
        else:
            node = minidom.Text()
            d = node.__dict__
            d['data'] = d['nodeValue'] = data
            d['ownerDocument'] = self.document
        _append_child(self.curNode, node)

    def character_data_handler(self, data):
        """Expat callback for character data when CDATA sections are not
        tracked: extend the trailing Text node or start a new one."""
        childNodes = self.curNode.childNodes
        if childNodes and childNodes[-1].nodeType == TEXT_NODE:
            node = childNodes[-1]
            d = node.__dict__
            d['data'] = d['nodeValue'] = node.data + data
            return
        node = minidom.Text()
        d = node.__dict__
        d['data'] = d['nodeValue'] = node.data + data
        d['ownerDocument'] = self.document
        _append_child(self.curNode, node)

    def entity_decl_handler(self, entityName, is_parameter_entity, value,
                            base, systemId, publicId, notationName):
        """Expat callback: record a general entity declaration on the
        doctype, unless the filter rejects it."""
        if is_parameter_entity:
            # we don't care about parameter entities for the DOM
            return
        if not self._options.entities:
            return
        node = self.document._create_entity(entityName, publicId,
                                            systemId, notationName)
        if value is not None:
            # internal entity
            # node *should* be readonly, but we'll cheat
            child = self.document.createTextNode(value)
            node.childNodes.append(child)
        self.document.doctype.entities._seq.append(node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.entities._seq[-1]

    def notation_decl_handler(self, notationName, base, systemId, publicId):
        """Expat callback: record a notation declaration on the doctype,
        unless the filter rejects it."""
        node = self.document._create_notation(notationName, publicId, systemId)
        self.document.doctype.notations._seq.append(node)
        # Drop the node only when the filter *rejects* it, matching the
        # entity, comment and PI handlers.  (This used to compare against
        # FILTER_ACCEPT, which removed exactly the notations the filter
        # wanted to keep.)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.notations._seq[-1]

    def comment_handler(self, data):
        """Expat callback: append a comment to the current node, unless
        the filter rejects it."""
        node = self.document.createComment(data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def start_cdata_section_handler(self):
        """Expat callback: note that a CDATA section has started."""
        self._cdata = True
        self._cdata_continue = False

    def end_cdata_section_handler(self):
        """Expat callback: note that the CDATA section has ended."""
        self._cdata = False
        self._cdata_continue = False

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Expat callback: external entities are not loaded; always
        returns 1."""
        return 1

    def first_element_handler(self, name, attributes):
        """Expat callback used for the document element only.

        Installs the regular start_element_handler() for every later
        element, then delegates to it.
        """
        if self._filter is None and not self._elem_info:
            # No filter and no DTD information: the end-of-element
            # post-processing has nothing to do.
            self._finish_end_element = id
        self.getParser().StartElementHandler = self.start_element_handler
        self.start_element_handler(name, attributes)

    def start_element_handler(self, name, attributes):
        """Expat callback: create an element with its attributes and
        make it the current node."""
        node = self.document.createElement(name)
        _append_child(self.curNode, node)
        self.curNode = node

        if attributes:
            # attributes is a flat [name, value, ...] list.
            for i in range(0, len(attributes), 2):
                a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
                                 None, EMPTY_PREFIX)
                value = attributes[i+1]
                d = a.childNodes[0].__dict__
                d['data'] = d['nodeValue'] = value
                d = a.__dict__
                d['value'] = d['nodeValue'] = value
                d['ownerDocument'] = self.document
                _set_attribute_node(node, a)

        if node is not self.document.documentElement:
            self._finish_start_element(node)

    def _finish_start_element(self, node):
        """Consult the filter's startContainer() for node and, when the
        node is rejected or skipped, detach it and hand the parser over
        to a Rejecter or Skipper."""
        if self._filter:
            # To be general, we'd have to call isSameNode(), but this
            # is sufficient for minidom:
            if node is self.document.documentElement:
                return
            filt = self._filter.startContainer(node)
            if filt == FILTER_REJECT:
                # ignore this node & all descendants
                Rejecter(self)
            elif filt == FILTER_SKIP:
                # ignore this node, but make its children become
                # children of the parent node
                Skipper(self)
            else:
                return
            self.curNode = node.parentNode
            node.parentNode.removeChild(node)
            node.unlink()

    # If this ever changes, Namespaces.end_element_handler() needs to
    # be changed to match.
    #
    def end_element_handler(self, name):
        """Expat callback: pop back to the parent node and run the
        end-of-element post-processing."""
        curNode = self.curNode
        self.curNode = curNode.parentNode
        self._finish_end_element(curNode)

    def _finish_end_element(self, curNode):
        """Strip ignorable whitespace (when DTD information is known)
        and apply the filter's acceptNode() to the finished element."""
        info = self._elem_info.get(curNode.tagName)
        if info:
            self._handle_white_text_nodes(curNode, info)
        if self._filter:
            if curNode is self.document.documentElement:
                return
            if self._filter.acceptNode(curNode) == FILTER_REJECT:
                self.curNode.removeChild(curNode)
                curNode.unlink()

    def _handle_white_text_nodes(self, node, info):
        """Remove whitespace-only text children of node when info says
        the element has element content and the options do not ask for
        such whitespace to be kept."""
        if (self._options.whitespace_in_element_content
            or not info.isElementContent()):
            return

        # We have element type information and should remove ignorable
        # whitespace; identify the text nodes which contain only
        # whitespace.  Collect first: childNodes must not be modified
        # while it is being iterated.
        L = []
        for child in node.childNodes:
            if child.nodeType == TEXT_NODE and not child.data.strip():
                L.append(child)

        # Remove ignorable whitespace from the tree.
        for child in L:
            node.removeChild(child)

    def element_decl_handler(self, name, model):
        """Expat callback: record the content model of an element type."""
        info = self._elem_info.get(name)
        if info is None:
            self._elem_info[name] = ElementInfo(name, model)
        else:
            # The record was created earlier by attlist_decl_handler().
            assert info._model is None
            info._model = model

    def attlist_decl_handler(self, elem, name, type, default, required):
        """Expat callback: record one attribute declaration for element
        type elem."""
        info = self._elem_info.get(elem)
        if info is None:
            info = ElementInfo(elem)
            self._elem_info[elem] = info
        # ElementInfo reads index 1 (name) and index -2 (type).
        info._attr_info.append(
            [None, name, None, None, default, 0, type, required])

    def xml_decl_handler(self, version, encoding, standalone):
        """Expat callback: copy the XML declaration onto the document.

        A negative standalone value leaves the document's standalone
        attribute untouched.
        """
        self.document.version = version
        self.document.encoding = encoding
        # This is still a little ugly, thanks to the pyexpat API. ;-(
        if standalone >= 0:
            if standalone:
                self.document.standalone = True
            else:
                self.document.standalone = False
457
458
# Values a filter callback may legally return.  FILTER_INTERRUPT is
# deliberately left out, since that's checked separately where allowed
# (FilterVisibilityController tests for it before consulting this tuple).
_ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)
462
class FilterVisibilityController(object):
    """Wrapper around a DOMBuilderFilter which implements the checks
    to make the whatToShow filter attribute work."""

    __slots__ = 'filter',

    def __init__(self, filter):
        # The wrapped filter object; its whatToShow, startContainer()
        # and acceptNode() are used below.
        self.filter = filter

    def startContainer(self, node):
        """Ask the filter whether node should be opened.

        Returns FILTER_ACCEPT without calling the filter when the node's
        type is not selected by whatToShow; otherwise returns the
        filter's answer.

        Raises ParseEscape when the filter returns FILTER_INTERRUPT and
        ValueError when it returns any other unrecognised value.
        """
        mask = self._nodetype_mask[node.nodeType]
        if self.filter.whatToShow & mask:
            val = self.filter.startContainer(node)
            if val == FILTER_INTERRUPT:
                raise ParseEscape
            if val not in _ALLOWED_FILTER_RETURNS:
                # Call form of raise: same behaviour as the old
                # "raise ValueError, msg" statement, but also valid
                # syntax on Python 3.
                raise ValueError(
                    "startContainer() returned illegal value: " + repr(val))
            return val
        else:
            return FILTER_ACCEPT

    def acceptNode(self, node):
        """Ask the filter whether a completed node should be kept.

        Returns FILTER_ACCEPT without calling the filter when the node's
        type is not selected by whatToShow.  A FILTER_SKIP answer is
        handled here by moving the node's children to its parent, and is
        reported to the caller as FILTER_REJECT so that the node itself
        gets removed.

        Raises ParseEscape when the filter returns FILTER_INTERRUPT and
        ValueError when it returns any other unrecognised value.
        """
        mask = self._nodetype_mask[node.nodeType]
        if self.filter.whatToShow & mask:
            val = self.filter.acceptNode(node)
            if val == FILTER_INTERRUPT:
                raise ParseEscape
            if val == FILTER_SKIP:
                # move all child nodes to the parent, and remove this node
                parent = node.parentNode
                # Iterate over a copy: appendChild() mutates childNodes.
                for child in node.childNodes[:]:
                    parent.appendChild(child)
                # node is handled by the caller
                return FILTER_REJECT
            if val not in _ALLOWED_FILTER_RETURNS:
                raise ValueError(
                    "acceptNode() returned illegal value: " + repr(val))
            return val
        else:
            return FILTER_ACCEPT

    # Node type -> the NodeFilter.SHOW_* bit tested against whatToShow.
    _nodetype_mask = {
        Node.ELEMENT_NODE:                NodeFilter.SHOW_ELEMENT,
        Node.ATTRIBUTE_NODE:              NodeFilter.SHOW_ATTRIBUTE,
        Node.TEXT_NODE:                   NodeFilter.SHOW_TEXT,
        Node.CDATA_SECTION_NODE:          NodeFilter.SHOW_CDATA_SECTION,
        Node.ENTITY_REFERENCE_NODE:       NodeFilter.SHOW_ENTITY_REFERENCE,
        Node.ENTITY_NODE:                 NodeFilter.SHOW_ENTITY,
        Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
        Node.COMMENT_NODE:                NodeFilter.SHOW_COMMENT,
        Node.DOCUMENT_NODE:               NodeFilter.SHOW_DOCUMENT,
        Node.DOCUMENT_TYPE_NODE:          NodeFilter.SHOW_DOCUMENT_TYPE,
        Node.DOCUMENT_FRAGMENT_NODE:      NodeFilter.SHOW_DOCUMENT_FRAGMENT,
        Node.NOTATION_NODE:               NodeFilter.SHOW_NOTATION,
        }
519
520
class FilterCrutch(object):
    """Base for helpers that temporarily take over a builder's element
    handlers.

    The constructor remembers the parser's current start/end element
    handlers and replaces them with this object's own
    start_element_handler / end_element_handler, which subclasses are
    expected to supply.
    """

    __slots__ = '_builder', '_level', '_old_start', '_old_end'

    def __init__(self, builder):
        # Starts at zero; the subclasses in this module use it as a
        # nesting counter.
        self._level = 0
        self._builder = builder
        expat_parser = builder._parser
        # Capture both previous handlers before either is overwritten.
        self._old_start, self._old_end = (expat_parser.StartElementHandler,
                                          expat_parser.EndElementHandler)
        expat_parser.StartElementHandler = self.start_element_handler
        expat_parser.EndElementHandler = self.end_element_handler
532
class Rejecter(FilterCrutch):
    """Handler swap used for a rejected element: everything up to the
    matching end tag is ignored."""

    __slots__ = ()

    def __init__(self, builder):
        FilterCrutch.__init__(self, builder)
        expat_parser = builder._parser
        # Silence every content callback while inside the rejected
        # element; end_element_handler() reinstalls them.
        expat_parser.ProcessingInstructionHandler = None
        expat_parser.CommentHandler = None
        expat_parser.CharacterDataHandler = None
        expat_parser.StartCdataSectionHandler = None
        expat_parser.EndCdataSectionHandler = None
        expat_parser.ExternalEntityRefHandler = None

    def start_element_handler(self, *args):
        # One level deeper inside the rejected subtree.
        self._level += 1

    def end_element_handler(self, *args):
        if self._level != 0:
            # Closing a nested element; still inside the rejected one.
            self._level -= 1
            return
        # The rejected element itself is closing: restore the old handlers.
        owner = self._builder
        expat_parser = owner._parser
        owner.install(expat_parser)
        expat_parser.StartElementHandler = self._old_start
        expat_parser.EndElementHandler = self._old_end
560
class Skipper(FilterCrutch):
    """Handler swap used for a skipped element: its descendants are
    still built by the saved handlers, while this object tracks when the
    skipped element itself ends."""

    __slots__ = ()

    def start_element_handler(self, *args):
        before = self._builder.curNode
        self._old_start(*args)
        # Count the element only if the builder actually descended into
        # it (i.e. its current node changed).
        if self._builder.curNode is not before:
            self._level += 1

    def end_element_handler(self, *args):
        if self._level != 0:
            self._level -= 1
            self._old_end(*args)
            return
        # We're popping back out of the node we're skipping, so we
        # shouldn't need to do anything but reset the handlers.
        expat_parser = self._builder._parser
        expat_parser.StartElementHandler = self._old_start
        expat_parser.EndElementHandler = self._old_end
        self._builder = None
580
581
# Framework document used by the fragment builder.
# Takes a string for the doctype, subset string, and namespace attrs string
# (the three remaining %s slots, filled in by FragmentBuilder.parseString()).

# System identifier of the synthetic external entity that stands in for
# the fragment source; FragmentBuilder.external_entity_ref_handler()
# recognises it by this value.
_FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
    "http://xml.python.org/entities/fragment-builder/internal"

# The single "%s" is substituted right here with the system id above;
# each "%%s" survives this first formatting pass as a "%s" placeholder.
_FRAGMENT_BUILDER_TEMPLATE = (
    '''\
<!DOCTYPE wrapper
  %%s [
  <!ENTITY fragment-builder-internal
    SYSTEM "%s">
%%s
]>
<wrapper %%s
>&fragment-builder-internal;</wrapper>'''
    % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)
599
600
class FragmentBuilder(ExpatBuilder):
    """Builder which constructs document fragments given XML source
    text and a context node.

    The context node is expected to provide information about the
    namespace declarations which are in scope at the start of the
    fragment.
    """

    def __init__(self, context, options=None):
        """Create a fragment builder.

        context is either a Document or a node owned by one; the
        owning document is remembered as originalDocument.
        """
        if context.nodeType == DOCUMENT_NODE:
            self.originalDocument = context
        else:
            self.originalDocument = context.ownerDocument
        self.context = context
        ExpatBuilder.__init__(self, options)

    def reset(self):
        """Reset the builder state and forget any fragment built so far."""
        ExpatBuilder.reset(self)
        self.fragment = None

    def parseFile(self, file):
        """Parse a document fragment from a file object, returning the
        fragment node."""
        return self.parseString(file.read())

    def parseString(self, string):
        """Parse a document fragment from a string, returning the
        fragment node."""
        # The source is not parsed directly: it is pulled in through the
        # synthetic external entity of the wrapper document (see
        # external_entity_ref_handler()).
        self._source = string
        parser = self.getParser()
        doctype = self.originalDocument.doctype
        ident = ""
        if doctype:
            subset = doctype.internalSubset or self._getDeclarations()
            if doctype.publicId:
                ident = ('PUBLIC "%s" "%s"'
                         % (doctype.publicId, doctype.systemId))
            elif doctype.systemId:
                ident = 'SYSTEM "%s"' % doctype.systemId
        else:
            subset = ""
        nsattrs = self._getNSattrs() # get ns decls from node's ancestors
        document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
        try:
            parser.Parse(document, 1)
        except:
            # Clean up on *any* failure, then let it propagate unchanged.
            self.reset()
            raise
        fragment = self.fragment
        self.reset()
        # NOTE(review): unlike ExpatBuilder.parseString(), self._parser is
        # left in place here (the reset was commented out in the
        # original); confirm whether that is intentional.
##         self._parser = None
        return fragment

    def _getDeclarations(self):
        """Re-create the internal subset from the DocumentType node.

        This is only needed if we don't already have the
        internalSubset as a string.
        """
        # Use originalDocument rather than context.ownerDocument: when the
        # context *is* the document, going through its ownerDocument
        # fails with AttributeError instead of finding the doctype.
        doctype = self.originalDocument.doctype
        if not doctype:
            return ""
        decls = []
        for i in range(doctype.notations.length):
            notation = doctype.notations.item(i)
            decl = "<!NOTATION %s" % notation.nodeName
            if notation.publicId:
                decl = '%s PUBLIC "%s"\n             "%s">' \
                       % (decl, notation.publicId, notation.systemId)
            else:
                decl = '%s SYSTEM "%s">' % (decl, notation.systemId)
            decls.append(decl)
        for i in range(doctype.entities.length):
            entity = doctype.entities.item(i)
            decl = "<!ENTITY %s" % entity.nodeName
            if entity.publicId:
                decl = '%s PUBLIC "%s"\n             "%s"' \
                       % (decl, entity.publicId, entity.systemId)
            elif entity.systemId:
                decl = '%s SYSTEM "%s"' % (decl, entity.systemId)
            else:
                # Internal entity: its replacement text is the single
                # text child.
                decl = '%s "%s"' % (decl, entity.firstChild.data)
            if entity.notationName:
                decl = "%s NOTATION %s" % (decl, entity.notationName)
            decls.append(decl + ">")
        # Declarations are separated by a newline plus two spaces.
        return "\n  ".join(decls)

    def _getNSattrs(self):
        """Return the namespace attribute text for the wrapper element;
        this implementation contributes none."""
        return ""

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Expat callback: parse the fragment source when the synthetic
        wrapper entity is referenced; defer to ExpatBuilder otherwise."""
        if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
            # this entref is the one that we made to put the subtree
            # in; all of our given input is parsed in here.
            old_document = self.document
            old_cur_node = self.curNode
            parser = self._parser.ExternalEntityParserCreate(context)
            # put the real document back, parse into the fragment to return
            self.document = self.originalDocument
            self.fragment = self.document.createDocumentFragment()
            self.curNode = self.fragment
            try:
                parser.Parse(self._source, 1)
            finally:
                # Always restore the wrapper-document state and release
                # the source text, even when the fragment fails to parse.
                self.curNode = old_cur_node
                self.document = old_document
                self._source = None
            return -1
        else:
            return ExpatBuilder.external_entity_ref_handler(
                self, context, base, systemId, publicId)
716
717
class Namespaces:
    """Mix-in class for builders; adds support for namespaces."""

    def _initNamespaces(self):
        """Start with an empty list of pending namespace declarations."""
        # list of (prefix, uri) ns declarations.  Namespace attrs are
        # constructed from this and added to the element's attrs.
        self._ns_ordered_prefixes = []

    def createParser(self):
        """Create a new namespace-handling parser."""
        ns_parser = expat.ParserCreate(namespace_separator=" ")
        ns_parser.namespace_prefixes = True
        return ns_parser

    def install(self, parser):
        """Insert the namespace-handlers onto the parser."""
        ExpatBuilder.install(self, parser)
        if not self._options.namespace_declarations:
            return
        parser.StartNamespaceDeclHandler = self.start_namespace_decl_handler

    def start_namespace_decl_handler(self, prefix, uri):
        """Push this namespace declaration on our storage."""
        declaration = (prefix, uri)
        self._ns_ordered_prefixes.append(declaration)

    def start_element_handler(self, name, attributes):
        """Create a namespace-aware element, attach the pending xmlns
        declarations and the parsed attributes, and make the element the
        current node."""
        if ' ' not in name:
            # Unqualified element name.
            uri = EMPTY_NAMESPACE
            qname = name
            localname = None
            prefix = EMPTY_PREFIX
        else:
            uri, localname, prefix, qname = _parse_ns_name(self, name)
        element = minidom.Element(qname, uri, prefix, localname)
        element.ownerDocument = self.document
        _append_child(self.curNode, element)
        self.curNode = element

        pending = self._ns_ordered_prefixes
        if pending:
            # Turn every declaration collected since the previous start
            # tag into an xmlns / xmlns:prefix attribute on this element.
            for decl_prefix, decl_uri in pending:
                if decl_prefix:
                    attr = minidom.Attr(_intern(self, 'xmlns:' + decl_prefix),
                                        XMLNS_NAMESPACE, decl_prefix, "xmlns")
                else:
                    attr = minidom.Attr("xmlns", XMLNS_NAMESPACE,
                                        "xmlns", EMPTY_PREFIX)
                text_state = attr.childNodes[0].__dict__
                text_state['data'] = text_state['nodeValue'] = decl_uri
                attr_state = attr.__dict__
                attr_state['value'] = attr_state['nodeValue'] = decl_uri
                attr_state['ownerDocument'] = self.document
                _set_attribute_node(element, attr)
            del pending[:]

        if attributes:
            by_qname = element._attrs
            by_ns = element._attrsNS
            # attributes is a flat [name, value, name, value, ...] list.
            for idx in range(0, len(attributes), 2):
                aname = attributes[idx]
                value = attributes[idx + 1]
                if ' ' not in aname:
                    attr = minidom.Attr(aname, EMPTY_NAMESPACE,
                                        aname, EMPTY_PREFIX)
                    by_qname[aname] = attr
                    by_ns[(EMPTY_NAMESPACE, aname)] = attr
                else:
                    a_uri, a_local, a_prefix, a_qname = _parse_ns_name(
                        self, aname)
                    attr = minidom.Attr(a_qname, a_uri, a_local, a_prefix)
                    by_qname[a_qname] = attr
                    by_ns[(a_uri, a_local)] = attr
                text_state = attr.childNodes[0].__dict__
                text_state['data'] = text_state['nodeValue'] = value
                attr_state = attr.__dict__
                attr_state['ownerDocument'] = self.document
                attr_state['value'] = attr_state['nodeValue'] = value
                attr_state['ownerElement'] = element

    if __debug__:
        # This only adds some asserts to the original
        # end_element_handler(), so we only define this when -O is not
        # used.  If changing one, be sure to check the other to see if
        # it needs to be changed as well.
        #
        def end_element_handler(self, name):
            closing = self.curNode
            if ' ' not in name:
                assert closing.nodeName == name, \
                       "element stack messed up - bad nodeName"
                assert closing.namespaceURI == EMPTY_NAMESPACE, \
                       "element stack messed up - bad namespaceURI"
            else:
                uri, localname, prefix, qname = _parse_ns_name(self, name)
                assert (closing.namespaceURI == uri
                        and closing.localName == localname
                        and closing.prefix == prefix), \
                        "element stack messed up! (namespace)"
            self.curNode = closing.parentNode
            self._finish_end_element(closing)
816
817
class ExpatBuilderNS(Namespaces, ExpatBuilder):
    """Document builder that supports namespaces."""

    def reset(self):
        """Reset the builder, then re-initialize the namespace state.

        Runs ExpatBuilder.reset() first and then self._initNamespaces().
        """
        # The base class is called explicitly by name, not via super().
        ExpatBuilder.reset(self)
        self._initNamespaces()
824
825
class FragmentBuilderNS(Namespaces, FragmentBuilder):
    """Fragment builder that supports namespaces."""

    def reset(self):
        """Reset the fragment builder, then re-initialize namespace state."""
        FragmentBuilder.reset(self)
        self._initNamespaces()

    def _getNSattrs(self):
        """Return string of namespace attributes from this element and
        ancestors.

        Walks from self.context up through parentNode links.  For every
        node that has a '_ns_prefix_uri' mapping, each prefix not seen
        on an earlier node contributes one xmlns declaration.  The
        result is "" when nothing was found; otherwise it starts with
        a single space and separates declarations with a newline plus
        four spaces.
        """
        # XXX This needs to be re-written to walk the ancestors of the
        # context to build up the namespace information from
        # declarations, elements, and attributes found in context.
        # Otherwise we have to store a bunch more data on the DOM
        # (though that *might* be more reliable -- not clear).
        #
        # NOTE(review): URIs are interpolated between single quotes
        # without any escaping -- confirm that callers never hit a URI
        # containing a quote or markup character.
        declarations = []
        seen_prefixes = set()
        node = self.context
        while node:
            if hasattr(node, '_ns_prefix_uri'):
                for prefix, uri in node._ns_prefix_uri.items():
                    # The first node to mention a prefix wins; the same
                    # prefix on a later node is ignored.
                    if prefix in seen_prefixes:
                        continue
                    seen_prefixes.add(prefix)
                    if prefix:
                        declname = "xmlns:" + prefix
                    else:
                        declname = "xmlns"
                    declarations.append("%s='%s'" % (declname, uri))
            node = node.parentNode
        if not declarations:
            return ""
        return " " + "\n    ".join(declarations)
861
862
class ParseEscape(Exception):
    """Exception raised to short-circuit parsing in InternalSubsetExtractor."""
866
class InternalSubsetExtractor(ExpatBuilder):
    """XML processor which can rip out the internal document type subset.

    Parsing is abandoned by raising ParseEscape as soon as the doctype
    declaration has been handled or the first element starts;
    parseFile() and parseString() swallow that exception.
    """

    # Stays None unless a doctype with an internal subset is seen.
    # While that doctype is being parsed this is a list of collected
    # chunks; at its end it is replaced by the joined string.
    subset = None

    def getSubset(self):
        """Return the internal subset as a string."""
        return self.subset

    def parseFile(self, file):
        """Parse 'file'; the result is read back with getSubset()."""
        try:
            ExpatBuilder.parseFile(self, file)
        except ParseEscape:
            # Expected: one of the handlers stopped the parse early.
            return

    def parseString(self, string):
        """Parse 'string'; the result is read back with getSubset()."""
        try:
            ExpatBuilder.parseString(self, string)
        except ParseEscape:
            # Expected: one of the handlers stopped the parse early.
            return

    def install(self, parser):
        """Install the doctype-start and element-start handlers."""
        parser.StartElementHandler = self.start_element_handler
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler

    def start_doctype_decl_handler(self, name, publicId, systemId,
                                   has_internal_subset):
        """Start collecting the internal subset, or stop if there is none."""
        if not has_internal_subset:
            raise ParseEscape()
        parser = self.getParser()
        chunks = []
        self.subset = chunks
        # Everything handed to the default handler from now on is
        # appended to the list until the doctype declaration ends.
        parser.DefaultHandler = chunks.append
        parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Join the collected chunks, normalize newlines, stop parsing."""
        text = ''.join(self.subset)
        # Replace '\r\n' before the lone '\r' so that a CRLF pair
        # becomes a single newline.
        for line_ending in ('\r\n', '\r'):
            text = text.replace(line_ending, '\n')
        self.subset = text
        raise ParseEscape()

    def start_element_handler(self, name, attrs):
        """Stop parsing as soon as an element starts."""
        raise ParseEscape()
909
910
def parse(file, namespaces=True):
    """Parse a document, returning the resulting Document node.

    'file' may be either a file name or an open file object.  A file
    name is opened in binary mode and closed again once parsing has
    finished or failed; an open file object is left open.

    'namespaces' selects ExpatBuilderNS when true and ExpatBuilder
    otherwise.
    """
    builder_class = ExpatBuilderNS
    if not namespaces:
        builder_class = ExpatBuilder
    builder = builder_class()

    if not isinstance(file, StringTypes):
        # Already a file object: the caller keeps ownership of it.
        return builder.parseFile(file)

    stream = open(file, 'rb')
    try:
        return builder.parseFile(stream)
    finally:
        stream.close()
930
931
def parseString(string, namespaces=True):
    """Parse a document from a string, returning the resulting
    Document node.

    'namespaces' selects ExpatBuilderNS when true and ExpatBuilder
    otherwise.
    """
    builder_class = ExpatBuilderNS
    if not namespaces:
        builder_class = ExpatBuilder
    return builder_class().parseString(string)
941
942
def parseFragment(file, context, namespaces=True):
    """Parse a fragment of a document, given the context from which it
    was originally extracted.  context should be the parent of the
    node(s) which are in the fragment.

    'file' may be either a file name or an open file object.  A file
    name is opened in binary mode and closed again once parsing has
    finished or failed; an open file object is left open.

    'namespaces' selects FragmentBuilderNS when true and
    FragmentBuilder otherwise.
    """
    builder_class = FragmentBuilderNS
    if not namespaces:
        builder_class = FragmentBuilder
    builder = builder_class(context)

    if not isinstance(file, StringTypes):
        # Already a file object: the caller keeps ownership of it.
        return builder.parseFile(file)

    stream = open(file, 'rb')
    try:
        return builder.parseFile(stream)
    finally:
        stream.close()
964
965
def parseFragmentString(string, context, namespaces=True):
    """Parse a fragment of a document from a string, given the context
    from which it was originally extracted.  context should be the
    parent of the node(s) which are in the fragment.

    'namespaces' selects FragmentBuilderNS when true and
    FragmentBuilder otherwise.
    """
    builder_class = FragmentBuilderNS
    if not namespaces:
        builder_class = FragmentBuilder
    return builder_class(context).parseString(string)
976
977
def makeBuilder(options):
    """Create a builder based on an Options object.

    Returns ExpatBuilderNS(options) when options.namespaces is true,
    and ExpatBuilder(options) otherwise.
    """
    if not options.namespaces:
        return ExpatBuilder(options)
    return ExpatBuilderNS(options)
984