• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Lightweight XML support for Python.
2
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree.  This module has two classes for this purpose:
5
6    1. ElementTree represents the whole XML document as a tree and
7
8    2. Element represents a single node in this tree.
9
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level.  Interactions with a single XML element
12 and its sub-elements are done on the Element level.
13
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary.  Each Element has a number of properties associated with it:
17
18    'tag' - a string containing the element's name.
19
20    'attributes' - a Python dictionary storing the element's attributes.
21
22    'text' - a string containing the element's text content.
23
24    'tail' - an optional string containing text after the element's end tag.
25
26    And a number of child elements stored in a Python sequence.
27
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
30
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
33
34"""
35
36#---------------------------------------------------------------------
37# Licensed to PSF under a Contributor Agreement.
38# See https://www.python.org/psf/license for licensing details.
39#
40# ElementTree
41# Copyright (c) 1999-2008 by Fredrik Lundh.  All rights reserved.
42#
43# fredrik@pythonware.com
44# http://www.pythonware.com
45# --------------------------------------------------------------------
46# The ElementTree toolkit is
47#
48# Copyright (c) 1999-2008 by Fredrik Lundh
49#
50# By obtaining, using, and/or copying this software and/or its
51# associated documentation, you agree that you have read, understood,
52# and will comply with the following terms and conditions:
53#
54# Permission to use, copy, modify, and distribute this software and
55# its associated documentation for any purpose and without fee is
56# hereby granted, provided that the above copyright notice appears in
57# all copies, and that both that copyright notice and this permission
58# notice appear in supporting documentation, and that the name of
59# Secret Labs AB or the author not be used in advertising or publicity
60# pertaining to distribution of the software without specific, written
61# prior permission.
62#
63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70# OF THIS SOFTWARE.
71# --------------------------------------------------------------------
72
# Names exported by a star-import of this module.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring", "fromstringlist",
    "indent", "iselement", "iterparse",
    "parse", "ParseError",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring", "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML", "XMLID",
    "XMLParser", "XMLPullParser",
    "register_namespace",
    "canonicalize", "C14NWriterTarget",
    ]

# Version string, exported through __all__ above (presumably the version of
# the ElementTree toolkit named in the header comment -- TODO confirm).
VERSION = "1.3.0"
94
95import sys
96import re
97import warnings
98import io
99import collections
100import collections.abc
101import contextlib
102import weakref
103
104from . import ElementPath
105
106
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, a ParseError carries two
    extra attributes:
        'code'     - the specific exception code
        'position' - the line and column of the error

    """
117
118# --------------------------------------------------------------------
119
120
def iselement(element):
    """Tell whether *element* looks like an Element.

    This is a duck-typing check: any object exposing a 'tag' attribute
    is accepted.
    """
    has_tag = hasattr(element, 'tag')
    return has_tag
124
125
class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes. *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement. This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        """Create an element named *tag*.

        *attrib* must be a dict (TypeError otherwise); it is copied and
        merged with the *extra* keyword arguments, so neither the caller's
        dictionary nor the shared default is ever mutated.

        """
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        self.tag = tag
        self.attrib = {**attrib, **extra}
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def __copy__(self):
        """Return a shallow copy.

        The attribute dictionary is copied (by the constructor), while the
        child elements themselves are shared with the original.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of direct subelements."""
        return len(self._children)

    def __bool__(self):
        """Return True if the element has subelements (deprecated test)."""
        warnings.warn(
            "Testing an element's truth value will always return True in "
            "future versions.  "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            DeprecationWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement (or list of subelements) at *index*."""
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement(s) at *index*.

        For a slice, *element* may be any iterable of elements; for an
        integer index it must be a single element.  TypeError is raised,
        and nothing is modified, if a non-element is supplied.

        """
        if isinstance(index, slice):
            # Materialize first: validating a one-shot iterator in place
            # would exhaust it, and the assignment below would then
            # silently store no children at all.
            element = list(element)
            for elt in element:
                self._assert_is_element(elt)
        else:
            self._assert_is_element(element)
        self._children[index] = element

    def __delitem__(self, index):
        """Delete the subelement(s) at *index*."""
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from a sequence.

        *elements* is a sequence with zero or more elements.

        Elements are validated and appended one at a time, so items that
        precede an invalid one have already been added when TypeError is
        raised.

        """
        for element in elements:
            self._assert_is_element(element)
            self._children.append(element)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*."""
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        """Raise TypeError unless *e* is an instance of the Python Element."""
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(): the result is the keys view of the
        attribute dictionary.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(): the result is the items view of the
        attribute dictionary.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        the string "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        Elements whose tag is neither a string nor None contribute no text
        of their own.

        """
        tag = self.tag
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t
413
414
def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and append it to *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element and *tag* is the name of the new
    subelement.  *attrib* is an optional dictionary of element attributes;
    *extra* holds additional attributes given as keyword arguments, which
    take precedence over entries of *attrib* with the same name.

    The element is built with parent.makeelement() and returned after
    being appended.

    """
    child = parent.makeelement(tag, {**attrib, **extra})
    parent.append(child)
    return child
431
432
def Comment(text=None):
    """Build a comment node.

    The returned object is an Element whose tag is this factory function
    itself; the standard serializer writes such an element out as an XML
    comment.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node
445
446
def ProcessingInstruction(target, text=None):
    """Build a processing instruction node.

    The returned object is an Element whose tag is this factory function
    itself; the standard serializer writes such an element out as an XML
    processing instruction.

    *target* is a string containing the processing instruction target and
    *text* is a string with the instruction contents, if any.  When *text*
    is non-empty it is joined to the target with a single space.

    """
    node = Element(ProcessingInstruction)
    node.text = target + " " + text if text else target
    return node

# Short alias for the factory above.
PI = ProcessingInstruction
464
465
class QName:
    """Wrapper around a qualified name.

    Wrapping a QName attribute value in this class lets the serializer
    apply proper namespace handling on output.

    *text_or_uri* is either the complete QName in the form {uri}local or,
    when *tag* is supplied, only the URI part.

    *tag* is optional; when given, *text_or_uri* is interpreted as the
    namespace URI and *tag* as the local name.

    Instances hash and compare by their text, both against other QName
    objects and against plain values.

    """

    def __init__(self, text_or_uri, tag=None):
        # A truthy *tag* means the first argument is only the URI part.
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return f"<{self.__class__.__name__} {self.text!r}>"

    def __hash__(self):
        return hash(self.text)

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs
510
511# --------------------------------------------------------------------
512
513
class ElementTree:
    """An XML element hierarchy.

    This class also provides support for serialization to and from
    standard XML.

    *element* is an optional root element node,
    *file* is an optional file handle or file name of an XML file whose
    contents will be used to initialize the tree with.

    """
    def __init__(self, element=None, file=None):
        self._root = element # first node
        if file:
            # Parsing replaces whatever root was passed as *element*.
            self.parse(file)

    def getroot(self):
        """Return root element of this tree."""
        return self._root

    def _setroot(self, element):
        """Replace root element of this tree.

        This will discard the current contents of the tree and replace it
        with the given element.  Use with care!

        """
        self._root = element

    def parse(self, source, parser=None):
        """Load external XML document into element tree.

        *source* is a file name or file object, *parser* is an optional parser
        instance that defaults to XMLParser.

        ParseError is raised if the parser fails to parse the document.

        Returns the root element of the given source document.

        """
        close_source = False
        if not hasattr(source, "read"):
            # Not a file object: treat it as a file name that we own and
            # therefore must close ourselves.
            source = open(source, "rb")
            close_source = True
        try:
            if parser is None:
                # If no parser was specified, create a default XMLParser
                parser = XMLParser()
                if hasattr(parser, '_parse_whole'):
                    # The default XMLParser, when it comes from an accelerator,
                    # can define an internal _parse_whole API for efficiency.
                    # It can be used to parse the whole source without feeding
                    # it with chunks.
                    self._root = parser._parse_whole(source)
                    return self._root
            while data := source.read(65536):
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if close_source:
                source.close()

    def iter(self, tag=None):
        """Create and return tree iterator for the root element.

        The iterator loops over all elements in this tree, in document order.

        *tag* is a string with the tag name to iterate over
        (default is to return all elements).

        """
        return self._root.iter(tag)

    @staticmethod
    def _relative_path(path):
        """Return *path* rewritten relative to the root element.

        A path starting with "/" gets "." prepended and a FutureWarning is
        issued; any other path is returned unchanged.

        """
        if path[:1] == "/":
            path = "." + path
            # stacklevel=3: skip this helper and the public finder method
            # so the warning points at the code that called the finder.
            warnings.warn(
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version.  If you rely on the current "
                "behaviour, change it to %r" % path,
                FutureWarning, stacklevel=3
                )
        return path

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        Same as getroot().find(path), which is Element.find()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        path = self._relative_path(path)
        return self._root.find(path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        Same as getroot().findtext(path),  which is Element.findtext()

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the text content of the first matching element, or the
        default value if none was found.

        """
        path = self._relative_path(path)
        return self._root.findtext(path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().findall(path), which is Element.findall().

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return list containing all matching elements in document order.

        """
        path = self._relative_path(path)
        return self._root.findall(path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().iterfind(path), which is element.iterfind()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        path = self._relative_path(path)
        return self._root.iterfind(path, namespaces)

    def write(self, file_or_filename,
              encoding=None,
              xml_declaration=None,
              default_namespace=None,
              method=None, *,
              short_empty_elements=True):
        """Write element tree to a file as XML.

        Arguments:
          *file_or_filename* -- file name or a file object opened for writing

          *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                        when *method* is "c14n")

          *xml_declaration* -- bool indicating if an XML declaration should be
                               added to the output. If None, an XML declaration
                               is added if encoding IS NOT either of:
                               US-ASCII, UTF-8, or Unicode

          *default_namespace* -- sets the default XML namespace (for "xmlns")

          *method* -- either "xml" (default), "html", "text", or "c14n"

          *short_empty_elements* -- controls the formatting of elements
                                    that contain no content. If True (default)
                                    they are emitted as a single self-closed
                                    tag, otherwise they are emitted as a pair
                                    of start/end tags

        ValueError is raised for an unknown *method*.

        """
        if not method:
            method = "xml"
        elif method not in _serialize:
            raise ValueError("unknown method %r" % method)
        if not encoding:
            if method == "c14n":
                encoding = "utf-8"
            else:
                encoding = "us-ascii"
        with _get_writer(file_or_filename, encoding) as (write, declared_encoding):
            # The declaration is only ever written for the "xml" method.
            if method == "xml" and (xml_declaration or
                    (xml_declaration is None and
                     encoding.lower() != "unicode" and
                     declared_encoding.lower() not in ("utf-8", "us-ascii"))):
                write("<?xml version='1.0' encoding='%s'?>\n" % (
                    declared_encoding,))
            if method == "text":
                _serialize_text(write, self._root)
            else:
                qnames, namespaces = _namespaces(self._root, default_namespace)
                serialize = _serialize[method]
                serialize(write, self._root, qnames, namespaces,
                          short_empty_elements=short_empty_elements)

    def write_c14n(self, file):
        """Write the tree to *file* using the "c14n" output method."""
        # lxml.etree compatibility.  use output method instead
        return self.write(file, method="c14n")
735
736# --------------------------------------------------------------------
737# serialization support
738
739@contextlib.contextmanager
740def _get_writer(file_or_filename, encoding):
741    # returns text write method and release all resources after using
742    try:
743        write = file_or_filename.write
744    except AttributeError:
745        # file_or_filename is a file name
746        if encoding.lower() == "unicode":
747            encoding="utf-8"
748        with open(file_or_filename, "w", encoding=encoding,
749                  errors="xmlcharrefreplace") as file:
750            yield file.write, encoding
751    else:
752        # file_or_filename is a file-like object
753        # encoding determines if it is a text or binary writer
754        if encoding.lower() == "unicode":
755            # use a text writer as is
756            yield write, getattr(file_or_filename, "encoding", None) or "utf-8"
757        else:
758            # wrap a binary writer with TextIOWrapper
759            with contextlib.ExitStack() as stack:
760                if isinstance(file_or_filename, io.BufferedIOBase):
761                    file = file_or_filename
762                elif isinstance(file_or_filename, io.RawIOBase):
763                    file = io.BufferedWriter(file_or_filename)
764                    # Keep the original file open when the BufferedWriter is
765                    # destroyed
766                    stack.callback(file.detach)
767                else:
768                    # This is to handle passed objects that aren't in the
769                    # IOBase hierarchy, but just have a write method
770                    file = io.BufferedIOBase()
771                    file.writable = lambda: True
772                    file.write = write
773                    try:
774                        # TextIOWrapper uses this methods to determine
775                        # if BOM (for UTF-16, etc) should be added
776                        file.seekable = file_or_filename.seekable
777                        file.tell = file_or_filename.tell
778                    except AttributeError:
779                        pass
780                file = io.TextIOWrapper(file,
781                                        encoding=encoding,
782                                        errors="xmlcharrefreplace",
783                                        newline="\n")
784                # Keep the original file open when the TextIOWrapper is
785                # destroyed
786                stack.callback(file.detach)
787                yield file.write, encoding
788
def _namespaces(elem, default_namespace=None):
    """Identify the qualified names and namespaces used in a tree.

    Walks *elem* and all of its descendants, looking at tags, attribute
    names, and QName-valued attribute values and text.

    Returns a (qnames, namespaces) pair: *qnames* maps each name found to
    its serialized prefix:local form, and *namespaces* maps namespace
    URIs to the prefixes chosen for them.  If *default_namespace* is
    given it is mapped to the empty prefix, and any non-qualified name
    then raises ValueError.

    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] != "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            # An empty prefix marks the default namespace: no "prefix:".
            qnames[qname] = "%s:%s" % (prefix, local) if prefix else local
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            name = tag.text
            if name not in qnames:
                add_qname(name)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for attr_name, attr_value in node.items():
            if isinstance(attr_name, QName):
                attr_name = attr_name.text
            if attr_name not in qnames:
                add_qname(attr_name)
            if isinstance(attr_value, QName) and attr_value.text not in qnames:
                add_qname(attr_value.text)
        content = node.text
        if isinstance(content, QName) and content.text not in qnames:
            add_qname(content.text)
    return qnames, namespaces
849
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Write *elem* and its whole subtree as XML through *write*.

    *write* is a callable receiving string fragments, *qnames* maps tags
    and attribute names to their serialized form, *namespaces* maps
    namespace URIs to prefixes (emitted as xmlns declarations on this
    element only; children are serialized with None), and
    *short_empty_elements* selects the "<tag />" form for elements that
    have neither text nor children.  Extra keyword arguments are accepted
    and ignored.
    """
    raw_tag = elem.tag
    content = elem.text
    if raw_tag is Comment:
        # comment text is written as-is, without escaping
        write("<!--%s-->" % content)
    elif raw_tag is ProcessingInstruction:
        # processing instruction text is written as-is as well
        write("<?%s?>" % content)
    else:
        name = qnames[raw_tag]
        if name is None:
            # no serialized name: emit only the text and the children
            if content:
                write(_escape_cdata(content))
            for child in elem:
                _serialize_xml(write, child, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                # namespace declarations come first, ordered by prefix
                declarations = sorted(namespaces.items(),
                                      key=lambda pair: pair[1])
                for uri, prefix in declarations:
                    suffix = ":" + prefix if prefix else prefix
                    write(" xmlns%s=\"%s\"" % (suffix, _escape_attrib(uri)))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    rendered = qnames[value.text]
                else:
                    rendered = _escape_attrib(value)
                write(" %s=\"%s\"" % (qnames[key], rendered))
            if not content and not len(elem) and short_empty_elements:
                write(" />")
            else:
                write(">")
                if content:
                    write(_escape_cdata(content))
                for child in elem:
                    _serialize_xml(write, child, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + name + ">")
    # the tail belongs to the element no matter which branch produced it
    if elem.tail:
        write(_escape_cdata(elem.tail))
899
# Lower-case names of the HTML elements for which _serialize_html() does not
# write an end tag.
HTML_EMPTY = {
    "area",
    "base",
    "basefont",
    "br",
    "col",
    "embed",
    "frame",
    "hr",
    "img",
    "input",
    "isindex",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
}
903
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Write *elem* and its whole subtree as HTML through *write*.

    *write* is a callable receiving string fragments, *qnames* maps tags
    and attribute names to their serialized form, and *namespaces* maps
    namespace URIs to prefixes (declared on this element only; children
    are serialized with None).  Extra keyword arguments are accepted and
    ignored.

    Unlike the XML serializer, start tags are always closed with ">",
    the text of script and style elements is written unescaped, and no
    end tag is written for the elements listed in HTML_EMPTY.
    """
    raw_tag = elem.tag
    content = elem.text
    if raw_tag is Comment:
        write("<!--%s-->" % _escape_cdata(content))
    elif raw_tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(content))
    else:
        name = qnames[raw_tag]
        if name is None:
            # no serialized name: emit only the text and the children
            if content:
                write(_escape_cdata(content))
            for child in elem:
                _serialize_html(write, child, qnames, None)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                # namespace declarations come first, ordered by prefix
                declarations = sorted(namespaces.items(),
                                      key=lambda pair: pair[1])
                for uri, prefix in declarations:
                    suffix = ":" + prefix if prefix else prefix
                    write(" xmlns%s=\"%s\"" % (suffix, _escape_attrib(uri)))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    rendered = qnames[value.text]
                else:
                    rendered = _escape_attrib_html(value)
                # FIXME: handle boolean attributes
                write(" %s=\"%s\"" % (qnames[key], rendered))
            write(">")
            lowered = name.lower()
            if content:
                if lowered in ("script", "style"):
                    # raw text elements: content must not be escaped
                    write(content)
                else:
                    write(_escape_cdata(content))
            for child in elem:
                _serialize_html(write, child, qnames, None)
            if lowered not in HTML_EMPTY:
                write("</" + name + ">")
    # the tail belongs to the element no matter which branch produced it
    if elem.tail:
        write(_escape_cdata(elem.tail))
953
954def _serialize_text(write, elem):
955    for part in elem.itertext():
956        write(part)
957    if elem.tail:
958        write(elem.tail)
959
# Dispatch table from output method name to the function implementing it.
_serialize = dict(
    xml=_serialize_xml,
    html=_serialize_html,
    text=_serialize_text,
    # this optional method is imported at the end of the module
    #   c14n=_serialize_c14n,
)
967
968
def register_namespace(prefix, uri):
    """Register *prefix* as the preferred prefix for namespace *uri*.

    The registry is global.  Any entry already registered for the same
    URI or using the same prefix is dropped before the new mapping is
    stored.  Tags and attributes in this namespace will be serialized
    with the prefix if possible.

    ValueError is raised if *prefix* has the form "ns<digits>", which is
    reserved for prefixes generated by the serializer.

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Collect the conflicting URIs first so that the registry is not
    # modified while it is being scanned.
    conflicting = [
        known_uri
        for known_uri, known_prefix in _namespace_map.items()
        if known_uri == uri or known_prefix == prefix
    ]
    for known_uri in conflicting:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix
987
# Global registry mapping namespace URIs to their preferred prefixes.
# register_namespace() adds and removes entries here, and the qname
# collection code consults it when a URI has no prefix assigned yet.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting: expose the registry as an attribute of
# the public function (same dict object, not a copy).
register_namespace._namespace_map = _namespace_map
1002
1003def _raise_serialization_error(text):
1004    raise TypeError(
1005        "cannot serialize %r (type %s)" % (text, type(text).__name__)
1006        )
1007
1008def _escape_cdata(text):
1009    # escape character data
1010    try:
1011        # it's worth avoiding do-nothing calls for strings that are
1012        # shorter than 500 characters, or so.  assume that's, by far,
1013        # the most common case in most applications.
1014        if "&" in text:
1015            text = text.replace("&", "&amp;")
1016        if "<" in text:
1017            text = text.replace("<", "&lt;")
1018        if ">" in text:
1019            text = text.replace(">", "&gt;")
1020        return text
1021    except (TypeError, AttributeError):
1022        _raise_serialization_error(text)
1023
1024def _escape_attrib(text):
1025    # escape attribute value
1026    try:
1027        if "&" in text:
1028            text = text.replace("&", "&amp;")
1029        if "<" in text:
1030            text = text.replace("<", "&lt;")
1031        if ">" in text:
1032            text = text.replace(">", "&gt;")
1033        if "\"" in text:
1034            text = text.replace("\"", "&quot;")
1035        # Although section 2.11 of the XML specification states that CR or
1036        # CR LN should be replaced with just LN, it applies only to EOLNs
1037        # which take part of organizing file into lines. Within attributes,
1038        # we are replacing these with entity numbers, so they do not count.
1039        # http://www.w3.org/TR/REC-xml/#sec-line-ends
1040        # The current solution, contained in following six lines, was
1041        # discussed in issue 17582 and 39011.
1042        if "\r" in text:
1043            text = text.replace("\r", "&#13;")
1044        if "\n" in text:
1045            text = text.replace("\n", "&#10;")
1046        if "\t" in text:
1047            text = text.replace("\t", "&#09;")
1048        return text
1049    except (TypeError, AttributeError):
1050        _raise_serialization_error(text)
1051
1052def _escape_attrib_html(text):
1053    # escape attribute value
1054    try:
1055        if "&" in text:
1056            text = text.replace("&", "&amp;")
1057        if ">" in text:
1058            text = text.replace(">", "&gt;")
1059        if "\"" in text:
1060            text = text.replace("\"", "&quot;")
1061        return text
1062    except (TypeError, AttributeError):
1063        _raise_serialization_error(text)
1064
1065# --------------------------------------------------------------------
1066
def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If *encoding* is "unicode", the output
    is collected in a text buffer and a string is returned.  Otherwise it
    is collected in a bytes buffer and a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which
    can be one of "xml" (default), "html", "text" or "c14n",
    *default_namespace* sets the default XML namespace (for "xmlns").
    *xml_declaration* and *short_empty_elements* are forwarded unchanged
    to ElementTree.write().

    Returns an (optionally) encoded string containing the XML data.

    """
    if encoding == 'unicode':
        buffer = io.StringIO()
    else:
        buffer = io.BytesIO()
    document = ElementTree(element)
    document.write(buffer, encoding,
                   xml_declaration=xml_declaration,
                   default_namespace=default_namespace,
                   method=method,
                   short_empty_elements=short_empty_elements)
    return buffer.getvalue()
1090
1091class _ListDataStream(io.BufferedIOBase):
1092    """An auxiliary stream accumulating into a list reference."""
1093    def __init__(self, lst):
1094        self.lst = lst
1095
1096    def writable(self):
1097        return True
1098
1099    def seekable(self):
1100        return True
1101
1102    def write(self, b):
1103        self.lst.append(b)
1104
1105    def tell(self):
1106        return len(self.lst)
1107
def tostringlist(element, encoding=None, method=None, *,
                 xml_declaration=None, default_namespace=None,
                 short_empty_elements=True):
    """Generate string representation of XML element as a list of chunks.

    Takes the same arguments as tostring(); all of them are forwarded to
    ElementTree.write().  Instead of a single value, the list of chunks
    that were written to the output stream is returned.

    """
    chunks = []
    document = ElementTree(element)
    document.write(_ListDataStream(chunks), encoding,
                   xml_declaration=xml_declaration,
                   default_namespace=default_namespace,
                   method=method,
                   short_empty_elements=short_empty_elements)
    return chunks
1119
1120
def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element (which is wrapped
    in an ElementTree first).  The exact output format is implementation
    dependent.  In this version, it's written as an ordinary XML file,
    followed by a newline unless the root element's tail already ends
    with one.

    """
    document = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    document.write(sys.stdout, encoding="unicode")
    trailing = document.getroot().tail
    ends_with_newline = trailing and trailing[-1] == "\n"
    if not ends_with_newline:
        sys.stdout.write("\n")
1138
1139
def indent(tree, space="  ", level=0):
    """Pretty-print an XML tree in place by rewriting whitespace.

    Newlines and indentation are stored in the text and tail of the
    elements below the root.  Text or tail that contains anything other
    than whitespace is left untouched.

    *tree* is the ElementTree or Element to modify.  The (root) element
    itself will not be changed, but the tail text of all elements in its
    subtree will be adapted.

    *space* is the whitespace to insert for each indentation level, two
    space characters by default.

    *level* is the initial indentation level. Setting this to a higher
    value than 0 can be used for indenting subtrees that are more deeply
    nested inside of a document.  A negative value raises ValueError.
    """
    root = tree.getroot() if isinstance(tree, ElementTree) else tree
    if level < 0:
        raise ValueError(f"Initial indentation level must be >= 0, got {level}")
    if len(root) == 0:
        return

    # Indentation strings are shared between elements to save memory.
    # Entry i is the newline plus indentation for depth i below the root.
    prefixes = ["\n" + level * space]

    def _indent_children(parent, depth):
        # Children sit one level deeper than their parent; create the
        # indentation string for that level on first use.
        inner_depth = depth + 1
        if inner_depth < len(prefixes):
            inner_prefix = prefixes[inner_depth]
        else:
            inner_prefix = prefixes[depth] + space
            prefixes.append(inner_prefix)

        leading = parent.text
        if not (leading and leading.strip()):
            parent.text = inner_prefix

        for node in parent:
            if len(node):
                _indent_children(node, inner_depth)
            if not (node.tail and node.tail.strip()):
                node.tail = inner_prefix

        # The last child is followed by the parent's end tag, so its tail
        # is dedented back to the parent's level.
        if not node.tail.strip():
            node.tail = prefixes[depth]

    _indent_children(root, 0)
1188
1189
1190# --------------------------------------------------------------------
1191# parsing
1192
1193
def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.
    Both are handed to the parse() method of a new, empty ElementTree.

    Return an ElementTree instance.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1206
1207
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    The returned iterator also reports what's going on to the user based on
    the *events* it is initialized with.  The supported events are the
    strings "start", "end", "start-ns" and "end-ns" (the "ns" events are
    used to get detailed namespace information).  If *events* is omitted,
    only "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  The iterator has a
    *root* attribute which is None until the document has been parsed
    completely, and a close() method.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)

    # Anything without a read() method is treated as a file name and opened
    # here in binary mode; only such self-opened files are closed by us.
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True
    else:
        close_source = False

    def iterator(source):
        try:
            while True:
                # hand out everything queued so far before reading more
                yield from pullparser.read_events()
                # load event buffer
                data = source.read(16 * 1024)
                if not data:
                    break
                pullparser.feed(data)
            root = pullparser._close_and_return_root()
            # closing the parser may have queued further events
            yield from pullparser.read_events()
            # 'wr' is bound further down, before the generator first runs.
            # The iterator object is only reached through a weak reference
            # here (presumably to avoid a reference cycle between it and
            # this generator); it may already be gone.
            it = wr()
            if it is not None:
                it.root = root
        finally:
            if close_source:
                source.close()

    gen = iterator(source)
    class IterParseIterator(collections.abc.Iterator):
        # iteration is delegated directly to the generator above
        __next__ = gen.__next__
        def close(self):
            # release the self-opened file, then finalize the generator
            if close_source:
                source.close()
            gen.close()

        def __del__(self):
            # TODO: Emit a ResourceWarning if it was not explicitly closed.
            # (When the close() method will be supported in all maintained Python versions.)
            if close_source:
                source.close()

    it = IterParseIterator()
    it.root = None
    wr = weakref.ref(it)
    return it
1269
1270
class XMLPullParser:
    """Parser front end that queues parse events until they are read.

    Data is pushed in with feed() and the resulting (event, elem) pairs
    are pulled out with read_events().  *events* is a sequence of event
    names to report; if it is omitted, only "end" events are reported.
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See https://bugs.python.org/issue17741 for more details.
        if events is None:
            events = ("end",)
        self._events_queue = collections.deque()
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        self._parser._setevents(self._events_queue, events)

    def feed(self, data):
        """Feed encoded data to parser."""
        parser = self._parser
        if parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            parser.feed(data)
        except SyntaxError as exc:
            # Errors are queued instead of raised, so they surface from
            # read_events() after the events that preceded them.
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        # mark the end of the stream only after the parser closed cleanly
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.  A queued exception is raised when it
        is reached.
        """
        queue = self._events_queue
        while queue:
            item = queue.popleft()
            if isinstance(item, Exception):
                raise item
            yield item

    def flush(self):
        """Call flush() on the underlying parser.

        ValueError is raised once the parser has been closed.
        """
        parser = self._parser
        if parser is None:
            raise ValueError("flush() called after end of stream")
        parser.flush()
1327
1328
def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns whatever the parser's close() method returns, which is an
    Element instance for the standard parser.

    """
    active_parser = parser if parser else XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    return active_parser.close()
1344
1345
def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.  Elements whose "id" attribute
    is missing or empty are left out; if an id occurs more than once,
    the element visited last wins.

    """
    active_parser = parser if parser else XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    root = active_parser.close()
    elements_by_id = {}
    for node in root.iter():
        identifier = node.get("id")
        if identifier:
            elements_by_id[identifier] = node
    return root, elements_by_id
1366
# Parse XML document from string constant.  Alias for XML(): both names are
# bound to the same function object.
fromstring = XML
1369
def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other iterable of fragments, which are fed to
    the parser one at a time and in order.  *parser* is an optional parser
    instance, defaulting to the standard XMLParser.

    Returns whatever the parser's close() method returns, which is an
    Element instance for the standard parser.

    """
    active_parser = parser if parser else XMLParser(target=TreeBuilder())
    feed = active_parser.feed
    for fragment in sequence:
        feed(fragment)
    return active_parser.close()
1384
1385# --------------------------------------------------------------------
1386
1387
class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    *comment_factory* is a factory to create comments to be used instead of
    the standard factory.  If *insert_comments* is false (the default),
    comments will not be inserted into the tree.

    *pi_factory* is a factory to create processing instructions to be used
    instead of the standard factory.  If *insert_pis* is false (the default),
    processing instructions will not be inserted into the tree.
    """

    def __init__(self, element_factory=None, *,
                 comment_factory=None, pi_factory=None,
                 insert_comments=False, insert_pis=False):
        self._data = []    # character data collected since the last flush
        self._elem = []    # stack of the elements that are currently open
        self._last = None  # element that was opened or closed most recently
        self._root = None  # first element that was opened
        self._tail = None  # true if we're after an end tag
        self._factory = (
            Element if element_factory is None else element_factory)
        self._comment_factory = (
            Comment if comment_factory is None else comment_factory)
        self._pi_factory = (
            ProcessingInstruction if pi_factory is None else pi_factory)
        self.insert_comments = insert_comments
        self.insert_pis = insert_pis

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._root is not None, "missing toplevel element"
        return self._root

    def _flush(self):
        # Attach the collected character data to the last element, as its
        # tail when an end tag came last and as its text otherwise.  Data
        # collected before any element exists is discarded.
        if not self._data:
            return
        owner = self._last
        if owner is not None:
            text = "".join(self._data)
            if self._tail:
                assert owner.tail is None, "internal error (tail)"
                owner.tail = text
            else:
                assert owner.text is None, "internal error (text)"
                owner.text = text
        self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        elem = self._factory(tag, attrs)
        self._last = elem
        open_elements = self._elem
        if open_elements:
            # nested element: becomes a child of the innermost open one
            open_elements[-1].append(elem)
        elif self._root is None:
            self._root = elem
        open_elements.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   closed.tag, tag)
        self._tail = 1
        return closed

    def comment(self, text):
        """Create a comment using the comment_factory.

        *text* is the text of the comment.
        """
        return self._handle_single(
            self._comment_factory, self.insert_comments, text)

    def pi(self, target, text=None):
        """Create a processing instruction using the pi_factory.

        *target* is the target name of the processing instruction.
        *text* is the data of the processing instruction, or ''.
        """
        return self._handle_single(
            self._pi_factory, self.insert_pis, target, text)

    def _handle_single(self, factory, insert, *args):
        # Build a childless node (comment or processing instruction).  It
        # only becomes part of the tree when *insert* is true; following
        # character data then ends up in its tail.
        node = factory(*args)
        if not insert:
            return node
        self._flush()
        self._last = node
        if self._elem:
            self._elem[-1].append(node)
        self._tail = 1
        return node
1507
1508
1509# also see ElementTree and TreeBuilder
1510class XMLParser:
1511    """Element structure builder for XML source data based on the expat parser.
1512
1513    *target* is an optional target object which defaults to an instance of the
1514    standard TreeBuilder class, *encoding* is an optional encoding string
1515    which if given, overrides the encoding specified in the XML file:
1516    http://www.iana.org/assignments/character-sets
1517
1518    """
1519
1520    def __init__(self, *, target=None, encoding=None):
1521        try:
1522            from xml.parsers import expat
1523        except ImportError:
1524            try:
1525                import pyexpat as expat
1526            except ImportError:
1527                raise ImportError(
1528                    "No module named expat; use SimpleXMLTreeBuilder instead"
1529                    )
1530        parser = expat.ParserCreate(encoding, "}")
1531        if target is None:
1532            target = TreeBuilder()
1533        # underscored names are provided for compatibility only
1534        self.parser = self._parser = parser
1535        self.target = self._target = target
1536        self._error = expat.error
1537        self._names = {} # name memo cache
1538        # main callbacks
1539        parser.DefaultHandlerExpand = self._default
1540        if hasattr(target, 'start'):
1541            parser.StartElementHandler = self._start
1542        if hasattr(target, 'end'):
1543            parser.EndElementHandler = self._end
1544        if hasattr(target, 'start_ns'):
1545            parser.StartNamespaceDeclHandler = self._start_ns
1546        if hasattr(target, 'end_ns'):
1547            parser.EndNamespaceDeclHandler = self._end_ns
1548        if hasattr(target, 'data'):
1549            parser.CharacterDataHandler = target.data
1550        # miscellaneous callbacks
1551        if hasattr(target, 'comment'):
1552            parser.CommentHandler = target.comment
1553        if hasattr(target, 'pi'):
1554            parser.ProcessingInstructionHandler = target.pi
1555        # Configure pyexpat: buffering, new-style attribute handling.
1556        parser.buffer_text = 1
1557        parser.ordered_attributes = 1
1558        self._doctype = None
1559        self.entity = {}
1560        try:
1561            self.version = "Expat %d.%d.%d" % expat.version_info
1562        except AttributeError:
1563            pass # unknown
1564
    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same
        # as the *events* of XMLPullParser's constructor).
        # events_queue: the object the parsing events are appended to; only
        # its append() method is used here (XMLPullParser passes a deque).
        #
        # For every requested event the matching expat handler is replaced
        # by a closure that appends an (event, value) pair to the queue.
        # An unknown event name raises ValueError.
        #
        # Each closure receives its collaborators as default arguments, so
        # they are bound when the closure is defined; the name 'handler' is
        # rebound on every loop iteration.
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):
                    def handler(prefix, uri, event=event_name, append=append,
                                start_ns=self._start_ns):
                        append((event, start_ns(prefix, uri)))
                else:
                    # no target method: report the (prefix, uri) pair itself
                    def handler(prefix, uri, event=event_name, append=append):
                        append((event, (prefix or '', uri or '')))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):
                    def handler(prefix, event=event_name, append=append,
                                end_ns=self._end_ns):
                        append((event, end_ns(prefix)))
                else:
                    # no target method: the event carries no value
                    def handler(prefix, event=event_name, append=append):
                        append((event, None))
                parser.EndNamespaceDeclHandler = handler
            elif event_name == 'comment':
                def handler(text, event=event_name, append=append, self=self):
                    append((event, self.target.comment(text)))
                parser.CommentHandler = handler
            elif event_name == 'pi':
                def handler(pi_target, data, event=event_name, append=append,
                            self=self):
                    append((event, self.target.pi(pi_target, data)))
                parser.ProcessingInstructionHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)
1617
1618    def _raiseerror(self, value):
1619        err = ParseError(value)
1620        err.code = value.code
1621        err.position = value.lineno, value.offset
1622        raise err
1623
1624    def _fixname(self, key):
1625        # expand qname, and convert name string to ascii, if possible
1626        try:
1627            name = self._names[key]
1628        except KeyError:
1629            name = key
1630            if "}" in name:
1631                name = "{" + name
1632            self._names[key] = name
1633        return name
1634
1635    def _start_ns(self, prefix, uri):
1636        return self.target.start_ns(prefix or '', uri or '')
1637
1638    def _end_ns(self, prefix):
1639        return self.target.end_ns(prefix or '')
1640
1641    def _start(self, tag, attr_list):
1642        # Handler for expat's StartElementHandler. Since ordered_attributes
1643        # is set, the attributes are reported as a list of alternating
1644        # attribute name,value.
1645        fixname = self._fixname
1646        tag = fixname(tag)
1647        attrib = {}
1648        if attr_list:
1649            for i in range(0, len(attr_list), 2):
1650                attrib[fixname(attr_list[i])] = attr_list[i+1]
1651        return self.target.start(tag, attrib)
1652
1653    def _end(self, tag):
1654        return self.target.end(self._fixname(tag))
1655
    def _default(self, text):
        # Installed as expat's DefaultHandlerExpand callback.  It handles
        # two things: entity references, which are resolved through the
        # self.entity mapping, and the pieces of a <!DOCTYPE ...>
        # declaration, which are collected in self._doctype.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                # the target does not accept character data: nothing to do
                return
            try:
                # text looks like "&name;": strip the first and last
                # character to get the entity name
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                    self.parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                # end of the declaration: stop collecting
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                # Collected so far: name, "PUBLIC" or "SYSTEM", then the
                # identifier(s).  NOTE: 'type' shadows the builtin here.
                type = self._doctype[1]
                if type == "PUBLIC" and n == 4:
                    name, type, pubid, system = self._doctype
                    if pubid:
                        # drop first and last character (presumably the
                        # surrounding quotes)
                        pubid = pubid[1:-1]
                elif type == "SYSTEM" and n == 3:
                    name, type, system = self._doctype
                    pubid = None
                else:
                    # not complete yet, or a form that is not handled
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif hasattr(self, "doctype"):
                    # a doctype() method on the parser (e.g. defined by a
                    # subclass) is not called; only warn about it
                    warnings.warn(
                        "The doctype() method of XMLParser is ignored.  "
                        "Define doctype() method on the TreeBuilder target.",
                        RuntimeWarning)

                # the declaration has been reported: stop collecting
                self._doctype = None
1709
1710    def feed(self, data):
1711        """Feed encoded data to parser."""
1712        try:
1713            self.parser.Parse(data, False)
1714        except self._error as v:
1715            self._raiseerror(v)
1716
1717    def close(self):
1718        """Finish feeding data to parser and return element structure."""
1719        try:
1720            self.parser.Parse(b"", True) # end of data
1721        except self._error as v:
1722            self._raiseerror(v)
1723        try:
1724            close_handler = self.target.close
1725        except AttributeError:
1726            pass
1727        else:
1728            return close_handler()
1729        finally:
1730            # get rid of circular references
1731            del self.parser, self._parser
1732            del self.target, self._target
1733
1734    def flush(self):
1735        was_enabled = self.parser.GetReparseDeferralEnabled()
1736        try:
1737            self.parser.SetReparseDeferralEnabled(False)
1738            self.parser.Parse(b"", False)
1739        except self._error as v:
1740            self._raiseerror(v)
1741        finally:
1742            self.parser.SetReparseDeferralEnabled(was_enabled)
1743
1744# --------------------------------------------------------------------
1745# C14N 2.0
1746
def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Serialise XML input in its C14N 2.0 canonical form.

    Input comes from *xml_data* (an XML string) or, if that is not given,
    from *from_file* (a file path or file-like object); at least one of them
    is required, otherwise ValueError is raised.

    When *out* is given, it must be a file or file-like object; the canonical
    XML is written as text (not bytes) through its ``.write()`` method and
    None is returned.  To write to a file, open it in text mode with encoding
    "utf-8".  Without *out*, the output is returned as a text string.

    All other keyword arguments are passed on as configuration options to
    ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # Collect the output in memory only when the caller gave us no sink.
    capture = None
    if out is None:
        capture = io.StringIO()
        out = capture

    target = C14NWriterTarget(out.write, **options)
    parser = XMLParser(target=target)

    if xml_data is not None:
        parser.feed(xml_data)
        parser.close()
    else:
        # from_file cannot be None here, see the check at the top.
        parse(from_file, parser=parser)

    if capture is None:
        return None
    return capture.getvalue()
1775
1776
# Matcher for text that consists of two runs of word characters joined by a
# single colon ("prefix:name").  C14NWriterTarget uses it to spot text content
# and attribute values that look like prefixed QNames.  Returns a match
# object or None.
_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match
1778
1779
class C14NWriterTarget:
    """
    Canonicalization writer target for the XMLParser.

    Serialises parse events to XML C14N 2.0.

    The *write* function is used for writing out the resulting data stream
    as text (not bytes).  To write to a file, open it in text mode with encoding
    "utf-8" and pass its ``.write`` method.

    Configuration options:

    - *with_comments*: set to true to include comments
    - *strip_text*: set to true to strip whitespace before and after text content
    - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
    - *qname_aware_tags*: a set of qname aware tag names in which prefixes
                          should be replaced in text content
    - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
                           should be replaced in text content
    - *exclude_attrs*: a set of attribute names that should not be serialised
    - *exclude_tags*: a set of tag names that should not be serialised
    """
    def __init__(self, write, *,
                 with_comments=False, strip_text=False, rewrite_prefixes=False,
                 qname_aware_tags=None, qname_aware_attrs=None,
                 exclude_attrs=None, exclude_tags=None):
        self._write = write
        # Text chunks collected since the last flush.
        self._data = []
        self._with_comments = with_comments
        self._strip_text = strip_text
        self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None
        self._exclude_tags = set(exclude_tags) if exclude_tags else None

        self._rewrite_prefixes = rewrite_prefixes
        if qname_aware_tags:
            self._qname_aware_tags = set(qname_aware_tags)
        else:
            self._qname_aware_tags = None
        if qname_aware_attrs:
            # Bound method: called with an attribute dict, it returns the set
            # of qname aware attribute names present in it.
            self._find_qname_aware_attrs = set(qname_aware_attrs).intersection
        else:
            self._find_qname_aware_attrs = None

        # Stack with globally and newly declared namespaces as (uri, prefix) pairs.
        self._declared_ns_stack = [[
            ("http://www.w3.org/XML/1998/namespace", "xml"),
        ]]
        # Stack with user declared namespace prefixes as (uri, prefix) pairs.
        self._ns_stack = []
        if not rewrite_prefixes:
            self._ns_stack.append(list(_namespace_map.items()))
        self._ns_stack.append([])
        # Maps namespace URIs to generated prefixes (used with rewrite_prefixes).
        self._prefix_map = {}
        # Stack of the xml:space="preserve" state, one entry per open element.
        self._preserve_space = [False]
        # Arguments of a start() call for a qname aware tag that has not been
        # written yet because its text content must be inspected first.
        self._pending_start = None
        self._root_seen = False
        self._root_done = False
        # Nesting depth inside an excluded element (0 means "not ignoring").
        self._ignored_depth = 0

    def _iter_namespaces(self, ns_stack, _reversed=reversed):
        """Yield the (uri, prefix) pairs of *ns_stack*, innermost level first."""
        for namespaces in _reversed(ns_stack):
            if namespaces:  # almost no element declares new namespaces
                yield from namespaces

    def _resolve_prefix_name(self, prefixed_name):
        """Return "{uri}name" for the "prefix:name" string *prefixed_name*.

        The prefix is looked up in the user declared namespaces.  Raises
        ValueError if the prefix is not declared in scope.
        """
        prefix, name = prefixed_name.split(':', 1)
        for uri, p in self._iter_namespaces(self._ns_stack):
            if p == prefix:
                return f'{{{uri}}}{name}'
        raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')

    def _qname(self, qname, uri=None):
        """Return (serialised name, local name, uri) for *qname*.

        *qname* is either "{uri}tag" / "tag", or a plain tag name when *uri*
        is passed separately.  If the namespace has no usable declaration in
        the output yet, one is added to the innermost level of
        ``self._declared_ns_stack``.  Raises ValueError if no prefix can be
        found for a non-empty namespace URI.
        """
        if uri is None:
            uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)
        else:
            tag = qname

        # Prefixes already met while walking outwards; a declaration using one
        # of them further out is not accepted as a match.
        prefixes_seen = set()
        for u, prefix in self._iter_namespaces(self._declared_ns_stack):
            if u == uri and prefix not in prefixes_seen:
                return f'{prefix}:{tag}' if prefix else tag, tag, uri
            prefixes_seen.add(prefix)

        # Not declared yet => add new declaration.
        if self._rewrite_prefixes:
            if uri in self._prefix_map:
                prefix = self._prefix_map[uri]
            else:
                prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'
            self._declared_ns_stack[-1].append((uri, prefix))
            return f'{prefix}:{tag}', tag, uri

        if not uri and '' not in prefixes_seen:
            # No default namespace declared => no prefix needed.
            return tag, tag, uri

        for u, prefix in self._iter_namespaces(self._ns_stack):
            if u == uri:
                self._declared_ns_stack[-1].append((uri, prefix))
                return f'{prefix}:{tag}' if prefix else tag, tag, uri

        if not uri:
            # As soon as a default namespace is defined,
            # anything that has no namespace (and thus, no prefix) goes there.
            return tag, tag, uri

        raise ValueError(f'Namespace "{uri}" is not declared in scope')

    def data(self, data):
        """Collect a chunk of text content (dropped inside excluded elements)."""
        if not self._ignored_depth:
            self._data.append(data)

    def _flush(self, _join_text=''.join):
        """Write out the collected text and any deferred start tag.

        If a start tag is pending, it is written first; collected text that
        looks like "prefix:name" is then resolved and written as part of the
        start tag handling.  Any other non-empty text is written escaped,
        but only once the root element has been started.
        """
        data = _join_text(self._data)
        del self._data[:]
        if self._strip_text and not self._preserve_space[-1]:
            data = data.strip()
        if self._pending_start is not None:
            args, self._pending_start = self._pending_start, None
            qname_text = data if data and _looks_like_prefix_name(data) else None
            self._start(*args, qname_text)
            if qname_text is not None:
                # _start() has already written the resolved text.
                return
        if data and self._root_seen:
            self._write(_escape_cdata_c14n(data))

    def start_ns(self, prefix, uri):
        """Record a user declared namespace for the element about to start."""
        if self._ignored_depth:
            return
        # we may have to resolve qnames in text content, and a deferred start
        # tag must be written before a child's declarations are recorded
        if self._data or self._pending_start is not None:
            self._flush()
        self._ns_stack[-1].append((uri, prefix))

    def start(self, tag, attrs):
        """Handle the start of an element.

        Excluded elements (and everything below them) are skipped.  For
        qname aware tags, writing is deferred until the following text
        content is known; otherwise the start tag is written immediately.
        """
        if self._exclude_tags is not None and (
                self._ignored_depth or tag in self._exclude_tags):
            self._ignored_depth += 1
            return
        # Also flush when only a deferred start tag is waiting, so that a
        # parent without text is written before this child.
        if self._data or self._pending_start is not None:
            self._flush()

        new_namespaces = []
        self._declared_ns_stack.append(new_namespaces)

        if self._qname_aware_tags is not None and tag in self._qname_aware_tags:
            # Need to parse text first to see if it requires a prefix declaration.
            self._pending_start = (tag, attrs, new_namespaces)
            return
        self._start(tag, attrs, new_namespaces)

    def _start(self, tag, attrs, new_namespaces, qname_text=None):
        """Write the start tag of an element.

        *new_namespaces* is the (initially empty) list on top of
        ``self._declared_ns_stack`` that receives the declarations needed by
        this element.  *qname_text*, if given, is "prefix:name" text content
        that is resolved and written right after the start tag.
        """
        if self._exclude_attrs is not None and attrs:
            attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs}

        qnames = {tag, *attrs}
        resolved_names = {}

        # Resolve prefixes in attribute and tag text.
        if qname_text is not None:
            qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)
            qnames.add(qname)
        if self._find_qname_aware_attrs is not None and attrs:
            qattrs = self._find_qname_aware_attrs(attrs)
            if qattrs:
                for attr_name in qattrs:
                    value = attrs[attr_name]
                    if _looks_like_prefix_name(value):
                        qname = resolved_names[value] = self._resolve_prefix_name(value)
                        qnames.add(qname)
            else:
                qattrs = None
        else:
            qattrs = None

        # Assign prefixes in lexicographical order of used URIs.
        parse_qname = self._qname
        parsed_qnames = {n: parse_qname(n) for n in sorted(
            qnames, key=lambda n: n.split('}', 1))}

        # Write namespace declarations in prefix order ...
        if new_namespaces:
            attr_list = [
                ('xmlns:' + prefix if prefix else 'xmlns', uri)
                for uri, prefix in new_namespaces
            ]
            attr_list.sort()
        else:
            # almost always empty
            attr_list = []

        # ... followed by attributes in URI+name order
        if attrs:
            for k, v in sorted(attrs.items()):
                if qattrs is not None and k in qattrs and v in resolved_names:
                    v = parsed_qnames[resolved_names[v]][0]
                attr_qname, attr_name, uri = parsed_qnames[k]
                # No prefix for attributes in default ('') namespace.
                attr_list.append((attr_qname if uri else attr_name, v))

        # Honour xml:space attributes.
        space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')
        self._preserve_space.append(
            space_behaviour == 'preserve' if space_behaviour
            else self._preserve_space[-1])

        # Write the tag.
        write = self._write
        write('<' + parsed_qnames[tag][0])
        if attr_list:
            write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))
        write('>')

        # Write the resolved qname text content.
        if qname_text is not None:
            write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))

        self._root_seen = True
        self._ns_stack.append([])

    def end(self, tag):
        """Handle the end of an element and write its end tag."""
        if self._ignored_depth:
            self._ignored_depth -= 1
            return
        # A deferred start tag must be written even if the element is empty;
        # otherwise the end tag would be written without a start tag and the
        # per-element stacks popped below would get out of sync.
        if self._data or self._pending_start is not None:
            self._flush()
        self._write(f'</{self._qname(tag)[0]}>')
        self._preserve_space.pop()
        self._root_done = len(self._preserve_space) == 1
        self._declared_ns_stack.pop()
        self._ns_stack.pop()

    def comment(self, text):
        """Write a comment, if comments are enabled.

        Comments before or after the root element are separated from it by
        a newline.
        """
        if not self._with_comments:
            return
        if self._ignored_depth:
            return
        if self._root_done:
            self._write('\n')
        elif self._pending_start is not None or (self._root_seen and self._data):
            # Write a deferred start tag / preceding text before the comment.
            self._flush()
        self._write(f'<!--{_escape_cdata_c14n(text)}-->')
        if not self._root_seen:
            self._write('\n')

    def pi(self, target, data):
        """Write a processing instruction.

        Processing instructions before or after the root element are
        separated from it by a newline.
        """
        if self._ignored_depth:
            return
        if self._root_done:
            self._write('\n')
        elif self._pending_start is not None or (self._root_seen and self._data):
            # Write a deferred start tag / preceding text before the PI.
            self._flush()
        self._write(
            f'<?{target} {_escape_cdata_c14n(data)}?>' if data else f'<?{target}?>')
        if not self._root_seen:
            self._write('\n')
2036
2037
2038def _escape_cdata_c14n(text):
2039    # escape character data
2040    try:
2041        # it's worth avoiding do-nothing calls for strings that are
2042        # shorter than 500 character, or so.  assume that's, by far,
2043        # the most common case in most applications.
2044        if '&' in text:
2045            text = text.replace('&', '&amp;')
2046        if '<' in text:
2047            text = text.replace('<', '&lt;')
2048        if '>' in text:
2049            text = text.replace('>', '&gt;')
2050        if '\r' in text:
2051            text = text.replace('\r', '&#xD;')
2052        return text
2053    except (TypeError, AttributeError):
2054        _raise_serialization_error(text)
2055
2056
2057def _escape_attrib_c14n(text):
2058    # escape attribute value
2059    try:
2060        if '&' in text:
2061            text = text.replace('&', '&amp;')
2062        if '<' in text:
2063            text = text.replace('<', '&lt;')
2064        if '"' in text:
2065            text = text.replace('"', '&quot;')
2066        if '\t' in text:
2067            text = text.replace('\t', '&#x9;')
2068        if '\n' in text:
2069            text = text.replace('\n', '&#xA;')
2070        if '\r' in text:
2071            text = text.replace('\r', '&#xD;')
2072        return text
2073    except (TypeError, AttributeError):
2074        _raise_serialization_error(text)
2075
2076
2077# --------------------------------------------------------------------
2078
2079# Import the C accelerators
try:
    # Element is going to be shadowed by the C implementation. We need to keep
    # the Python version of it accessible for some "creative" use by external
    # code (see tests).
    _Element_Py = Element

    # The star import replaces the pure-Python definitions of these names:
    # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories
    from _elementtree import *
    # Names with a leading underscore are not covered by the star import.
    from _elementtree import _set_factories
except ImportError:
    # No C accelerator available: keep using the pure-Python definitions.
    pass
else:
    # Pass the Comment and ProcessingInstruction factories to the C module
    # (presumably so that it creates such nodes through them -- confirm
    # against _elementtree).
    _set_factories(Comment, ProcessingInstruction)
2093