• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Lightweight XML support for Python.
2
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree.  This module has two classes for this purpose:
5
6    1. ElementTree represents the whole XML document as a tree and
7
8    2. Element represents a single node in this tree.
9
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level.  Interactions with a single XML element
12 and its sub-elements are done on the Element level.
13
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary.  Each Element has a number of properties associated with it:
17
18    'tag' - a string containing the element's name.
19
20    'attributes' - a Python dictionary storing the element's attributes.
21
22    'text' - a string containing the element's text content.
23
24    'tail' - an optional string containing text after the element's end tag.
25
26    And a number of child elements stored in a Python sequence.
27
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
30
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
33
34"""
35
36#---------------------------------------------------------------------
37# Licensed to PSF under a Contributor Agreement.
38# See http://www.python.org/psf/license for licensing details.
39#
40# ElementTree
41# Copyright (c) 1999-2008 by Fredrik Lundh.  All rights reserved.
42#
43# fredrik@pythonware.com
44# http://www.pythonware.com
45# --------------------------------------------------------------------
46# The ElementTree toolkit is
47#
48# Copyright (c) 1999-2008 by Fredrik Lundh
49#
50# By obtaining, using, and/or copying this software and/or its
51# associated documentation, you agree that you have read, understood,
52# and will comply with the following terms and conditions:
53#
54# Permission to use, copy, modify, and distribute this software and
55# its associated documentation for any purpose and without fee is
56# hereby granted, provided that the above copyright notice appears in
57# all copies, and that both that copyright notice and this permission
58# notice appear in supporting documentation, and that the name of
59# Secret Labs AB or the author not be used in advertising or publicity
60# pertaining to distribution of the software without specific, written
61# prior permission.
62#
63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70# OF THIS SOFTWARE.
71# --------------------------------------------------------------------
72
# Names exported by "from <this module> import *".
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring", "fromstringlist",
    "indent", "iselement", "iterparse",
    "parse", "ParseError",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring", "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML", "XMLID",
    "XMLParser", "XMLPullParser",
    "register_namespace",
    "canonicalize", "C14NWriterTarget",
    ]

# Version identifier string, exported through __all__ above.
VERSION = "1.3.0"
94
95import sys
96import re
97import warnings
98import io
99import collections
100import collections.abc
101import contextlib
102
103from . import ElementPath
104
105
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, a ParseError is documented to carry
    two extra attributes (they are attached by the code that raises it,
    not by this class):

        'code'     - the specific exception code
        'position' - the line and column of the error

    """
116
117# --------------------------------------------------------------------
118
119
def iselement(element):
    """Return True if *element* looks like an Element (has a 'tag' attribute)."""
    try:
        element.tag
    except AttributeError:
        return False
    return True
123
124
class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes. *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement. This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        """Create an element named *tag*.

        *attrib* must be a dict (TypeError otherwise); its items are merged
        with the *extra* keyword arguments into a new per-instance dict,
        with *extra* taking precedence on duplicate keys.

        """
        # The shared default dict is safe here: it is only read while
        # building the fresh dict below and is never stored or mutated.
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        self.tag = tag
        self.attrib = {**attrib, **extra}
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy. Subelements will be shared with the
        original tree.

        Deprecated: emits a DeprecationWarning and delegates to __copy__().

        """
        warnings.warn(
            "elem.copy() is deprecated. Use copy.copy(elem) instead.",
            # stacklevel=2 attributes the warning to the code calling
            # copy(), matching what __bool__ does in this class.
            DeprecationWarning, stacklevel=2
            )
        return self.__copy__()

    def __copy__(self):
        """Shallow copy: new element, same child objects."""
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        # Slice assignment goes through __setitem__, so the children are
        # shared with (not copied from) this element.
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of direct subelements."""
        return len(self._children)

    def __bool__(self):
        """Return True if the element has subelements (emits FutureWarning)."""
        warnings.warn(
            "The behavior of this method will change in future versions.  "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        return self._children[index]

    def __setitem__(self, index, element):
        # For a slice, *element* is an iterable of elements and each one is
        # validated; for a plain index it is a single element.
        if isinstance(index, slice):
            for elt in element:
                self._assert_is_element(elt)
        else:
            self._assert_is_element(element)
        self._children[index] = element

    def __delitem__(self, index):
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from a sequence.

        *elements* is a sequence with zero or more elements.  It is
        traversed twice (once to validate, once to append).

        Raises TypeError if any item is not an Element; in that case
        nothing is appended.

        """
        for element in elements:
            self._assert_is_element(element)
        self._children.extend(elements)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        """Raise TypeError unless *e* is an instance of the Python Element."""
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        # No type check is performed here; the ValueError comes straight
        # from list.remove().
        self._children.remove(subelement)

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        # The attribute dict is emptied in place, while the child list is
        # replaced by a new list object.
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(): the result is whatever that call
        returns (a keys view for a plain dict), not a new list.

        """
        return self.attrib.keys()

    def items(self):
        """Get the element attributes as (name, value) pairs.

        Equivalent to attrib.items(): the result is whatever that call
        returns (an items view for a plain dict), not a new list.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None (for example the
        # factory functions used as tags for comments) yield no text.
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t
425
def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and append it to *parent*.

    The element is built by calling parent.makeelement(), so it has whatever
    type that method produces.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    Return the newly created element.

    """
    # Merge into a fresh dict so neither the caller's mapping nor the shared
    # default is handed to the new element.
    merged = {**attrib, **extra}
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
442
443
def Comment(text=None):
    """Comment element factory.

    Build an element whose tag is this very function; the standard
    serializer recognises that tag and writes the element as an XML comment.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node
456
457
def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Build an element whose tag is this very function; the standard
    serializer recognises that tag and writes the element as an XML
    processing instruction.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.  When
    *text* is non-empty the element's text is "<target> <text>", otherwise
    it is just *target*.

    """
    node = Element(ProcessingInstruction)
    node.text = (target + " " + text) if text else target
    return node

# Short alias for ProcessingInstruction.
PI = ProcessingInstruction
475
476
class QName:
    """Qualified name wrapper.

    Wrap a QName attribute value so that namespaces are handled properly
    on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Instances hash and compare by their 'text' attribute; the other operand
    of a comparison may be another QName or a plain value.

    """
    def __init__(self, text_or_uri, tag=None):
        # A falsy tag (None or "") leaves the first argument untouched.
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs
521
522# --------------------------------------------------------------------
523
524
class ElementTree:
    """An XML element hierarchy.

    This class also provides support for serialization to and from
    standard XML.

    *element* is an optional root element node,
    *file* is an optional file handle or file name of an XML file whose
    contents will be used to initialize the tree with.

    """
    def __init__(self, element=None, file=None):
        # No validation of *element* is performed here.
        self._root = element # first node
        if file:
            self.parse(file)

    def getroot(self):
        """Return root element of this tree."""
        return self._root

    def _setroot(self, element):
        """Replace root element of this tree.

        This will discard the current contents of the tree and replace it
        with the given element.  Use with care!

        """
        # No validation of *element* is performed here.
        self._root = element

    def parse(self, source, parser=None):
        """Load external XML document into element tree.

        *source* is a file name or file object, *parser* is an optional parser
        instance that defaults to XMLParser.

        ParseError is raised if the parser fails to parse the document.

        Returns the root element of the given source document.

        """
        # Only a file opened here is closed here; a caller-supplied file
        # object is left open.
        close_source = False
        if not hasattr(source, "read"):
            source = open(source, "rb")
            close_source = True
        try:
            if parser is None:
                # If no parser was specified, create a default XMLParser
                parser = XMLParser()
                if hasattr(parser, '_parse_whole'):
                    # The default XMLParser, when it comes from an accelerator,
                    # can define an internal _parse_whole API for efficiency.
                    # It can be used to parse the whole source without feeding
                    # it with chunks.
                    self._root = parser._parse_whole(source)
                    return self._root
            # Feed the parser in 64 KiB chunks until read() returns nothing.
            while data := source.read(65536):
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if close_source:
                source.close()

    def iter(self, tag=None):
        """Create and return tree iterator for the root element.

        The iterator loops over all elements in this tree, in document order.

        *tag* is a string with the tag name to iterate over
        (default is to return all elements).

        """
        # Fails with AttributeError if the tree has no root element.
        return self._root.iter(tag)

    @staticmethod
    def _relative_path(path):
        """Return *path* made relative to the root, warning if it was absolute.

        A path starting with "/" gets "." prepended and a FutureWarning is
        emitted; any other path is returned unchanged.

        """
        if path[:1] == "/":
            path = "." + path
            warnings.warn(
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version.  If you rely on the current "
                "behaviour, change it to %r" % path,
                # stacklevel=3 skips this helper and the public find*()
                # method so the warning points at the user's call site.
                FutureWarning, stacklevel=3
                )
        return path

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        Same as getroot().find(path), which is Element.find()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        # Fails with AttributeError if the tree has no root element.
        path = self._relative_path(path)
        return self._root.find(path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        Same as getroot().findtext(path),  which is Element.findtext()

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return whatever the root element's findtext() returns for the
        (possibly adjusted) path, *default* and *namespaces*.

        """
        # Fails with AttributeError if the tree has no root element.
        path = self._relative_path(path)
        return self._root.findtext(path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().findall(path), which is Element.findall().

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return list containing all matching elements in document order.

        """
        # Fails with AttributeError if the tree has no root element.
        path = self._relative_path(path)
        return self._root.findall(path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().iterfind(path), which is element.iterfind()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        # Fails with AttributeError if the tree has no root element.
        path = self._relative_path(path)
        return self._root.iterfind(path, namespaces)

    def write(self, file_or_filename,
              encoding=None,
              xml_declaration=None,
              default_namespace=None,
              method=None, *,
              short_empty_elements=True):
        """Write element tree to a file as XML.

        Arguments:
          *file_or_filename* -- file name or a file object opened for writing

          *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                        when *method* is "c14n")

          *xml_declaration* -- bool indicating if an XML declaration should be
                               added to the output. If None, an XML declaration
                               is added if encoding IS NOT either of:
                               US-ASCII, UTF-8, or Unicode

          *default_namespace* -- sets the default XML namespace (for "xmlns")

          *method* -- either "xml" (default), "html", "text", or "c14n"

          *short_empty_elements* -- controls the formatting of elements
                                    that contain no content. If True (default)
                                    they are emitted as a single self-closed
                                    tag, otherwise they are emitted as a pair
                                    of start/end tags

        Raises ValueError if *method* is given but is not a known method.

        """
        if not method:
            method = "xml"
        elif method not in _serialize:
            raise ValueError("unknown method %r" % method)
        if not encoding:
            if method == "c14n":
                encoding = "utf-8"
            else:
                encoding = "us-ascii"
        # Comparisons are case-insensitive, but the declaration below
        # echoes the encoding exactly as the caller spelled it.
        enc_lower = encoding.lower()
        with _get_writer(file_or_filename, enc_lower) as write:
            if method == "xml" and (xml_declaration or
                    (xml_declaration is None and
                     enc_lower not in ("utf-8", "us-ascii", "unicode"))):
                declared_encoding = encoding
                if enc_lower == "unicode":
                    # Retrieve the default encoding for the xml declaration
                    import locale
                    declared_encoding = locale.getpreferredencoding()
                write("<?xml version='1.0' encoding='%s'?>\n" % (
                    declared_encoding,))
            if method == "text":
                _serialize_text(write, self._root)
            else:
                qnames, namespaces = _namespaces(self._root, default_namespace)
                serialize = _serialize[method]
                serialize(write, self._root, qnames, namespaces,
                          short_empty_elements=short_empty_elements)

    def write_c14n(self, file):
        """Write the tree to *file* using the "c14n" output method."""
        # lxml.etree compatibility.  use output method instead
        return self.write(file, method="c14n")
754
755# --------------------------------------------------------------------
756# serialization support
757
@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Context manager yielding a text ``write`` callable for the target.

    *file_or_filename* is either an object with a ``write`` attribute or a
    value that is passed to open().  *encoding* is compared against the
    literal "unicode": with that value the target is treated as a text
    stream (a file name is opened in text mode with no explicit encoding);
    any other value is used as the output encoding, with the
    "xmlcharrefreplace" error handler.

    A file opened here is closed when the context exits.  An object passed
    in by the caller is not closed: wrappers created around it are detached
    on exit instead.

    """
    # returns text write method and release all resources after using
    try:
        write = file_or_filename.write
    except AttributeError:
        # file_or_filename is a file name
        if encoding == "unicode":
            file = open(file_or_filename, "w")
        else:
            file = open(file_or_filename, "w", encoding=encoding,
                        errors="xmlcharrefreplace")
        # The with-block closes the file opened above once the caller's
        # with-body finishes.
        with file:
            yield file.write
    else:
        # file_or_filename is a file-like object
        # encoding determines if it is a text or binary writer
        if encoding == "unicode":
            # use a text writer as is
            yield write
        else:
            # wrap a binary writer with TextIOWrapper
            # ExitStack runs its callbacks in reverse registration order, so
            # the TextIOWrapper is detached before the BufferedWriter is.
            with contextlib.ExitStack() as stack:
                if isinstance(file_or_filename, io.BufferedIOBase):
                    file = file_or_filename
                elif isinstance(file_or_filename, io.RawIOBase):
                    file = io.BufferedWriter(file_or_filename)
                    # Keep the original file open when the BufferedWriter is
                    # destroyed
                    stack.callback(file.detach)
                else:
                    # This is to handle passed objects that aren't in the
                    # IOBase hierarchy, but just have a write method
                    file = io.BufferedIOBase()
                    file.writable = lambda: True
                    file.write = write
                    try:
                        # TextIOWrapper uses this methods to determine
                        # if BOM (for UTF-16, etc) should be added
                        file.seekable = file_or_filename.seekable
                        file.tell = file_or_filename.tell
                    except AttributeError:
                        # Best effort: the object lacks seekable and/or
                        # tell, so the stand-in keeps its own for the
                        # missing one(s).
                        pass
                file = io.TextIOWrapper(file,
                                        encoding=encoding,
                                        errors="xmlcharrefreplace",
                                        newline="\n")
                # Keep the original file open when the TextIOWrapper is
                # destroyed
                stack.callback(file.detach)
                yield file.write
809
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used in a tree.

    Walk *elem* and everything below it (via elem.iter()) and look at each
    tag, each attribute name, and any attribute value or text that is a
    QName instance.

    Return a ``(qnames, namespaces)`` pair: *qnames* maps each name to its
    serialized "prefix:local" (or plain) form, and *namespaces* maps each
    namespace URI to the prefix chosen for it.  If *default_namespace* is
    given it is mapped to the empty prefix, and encountering a name without
    a namespace raises ValueError.

    """
    # name -> serialized form; the None entry lets a None tag be looked up
    # like any other name.
    qnames = {None: None}

    # namespace URI -> prefix
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # Work out the serialized representation of *qname* and record it.
        try:
            if qname[:1] != "{":
                # Plain, non-namespaced name.
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                # First sighting of this URI: prefer a registered prefix,
                # otherwise generate "nsN" from the current table size.
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            if prefix:
                qnames[qname] = "%s:%s" % (prefix, local)
            else:
                qnames[qname] = local # default element
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for name, value in node.items():
            if isinstance(name, QName):
                name = name.text
            if name not in qnames:
                add_qname(name)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        content = node.text
        if isinstance(content, QName) and content.text not in qnames:
            add_qname(content.text)
    return qnames, namespaces
870
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Serialize *elem* and its subtree as XML through *write*.

    *write* is called with string fragments.  *qnames* maps tags and
    attribute names to their output spelling.  *namespaces*, when non-empty,
    maps URIs to prefixes and is emitted as xmlns declarations on this
    element; children are serialized with None instead.  When
    *short_empty_elements* is true, an element without text and children is
    written as "<tag />".  Extra keyword arguments are accepted and ignored.
    """
    raw_tag = elem.tag
    content = elem.text
    if raw_tag is Comment:
        write("<!--%s-->" % content)
    elif raw_tag is ProcessingInstruction:
        write("<?%s?>" % content)
    else:
        name = qnames[raw_tag]
        if name is not None:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                # sort on prefix
                declarations = sorted(namespaces.items(),
                                      key=lambda pair: pair[1])
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib(value)
                write(" %s=\"%s\"" % (qnames[key], value))
            has_body = content or len(elem) or not short_empty_elements
            if has_body:
                write(">")
                if content:
                    write(_escape_cdata(content))
                for child in elem:
                    _serialize_xml(write, child, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + name + ">")
            else:
                write(" />")
        else:
            # A tag that maps to None produces no markup of its own: only
            # its text and its children are written.
            if content:
                write(_escape_cdata(content))
            for child in elem:
                _serialize_xml(write, child, qnames, None,
                               short_empty_elements=short_empty_elements)
    trailing = elem.tail
    if trailing:
        write(_escape_cdata(trailing))
920
# Lower-case names of the HTML elements that _serialize_html() writes
# without an end tag.  Kept as a plain (mutable) set, as before.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}
928
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Serialize *elem* and its subtree as HTML through *write*.

    *write* is called with string fragments.  *qnames* maps tags and
    attribute names to their output spelling.  *namespaces*, when non-empty,
    maps URIs to prefixes and is emitted as xmlns declarations on this
    element; children are serialized with None instead.  The text of
    "script" and "style" elements is written unescaped, and elements listed
    in HTML_EMPTY get no end tag.  Extra keyword arguments are accepted and
    ignored.
    """
    raw_tag = elem.tag
    content = elem.text
    if raw_tag is Comment:
        write("<!--%s-->" % _escape_cdata(content))
    elif raw_tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(content))
    else:
        name = qnames[raw_tag]
        if name is not None:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                # sort on prefix
                declarations = sorted(namespaces.items(),
                                      key=lambda pair: pair[1])
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib_html(value)
                # FIXME: handle boolean attributes
                write(" %s=\"%s\"" % (qnames[key], value))
            write(">")
            lowered = name.lower()
            if content:
                if lowered in ("script", "style"):
                    write(content)
                else:
                    write(_escape_cdata(content))
            for child in elem:
                _serialize_html(write, child, qnames, None)
            if lowered not in HTML_EMPTY:
                write("</" + name + ">")
        else:
            # A tag that maps to None produces no markup of its own: only
            # its text and its children are written.
            if content:
                write(_escape_cdata(content))
            for child in elem:
                _serialize_html(write, child, qnames, None)
    trailing = elem.tail
    if trailing:
        write(_escape_cdata(trailing))
978
979def _serialize_text(write, elem):
980    for part in elem.itertext():
981        write(part)
982    if elem.tail:
983        write(elem.tail)
984
# Maps an output method name to the function that serializes a tree in
# that format.  NOTE(review): presumably indexed by the *method* argument
# of the writer; the lookup itself is outside this part of the file.
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
# this optional method is imported at the end of the module
#   "c14n": _serialize_c14n,
}
992
993
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global: any existing entry that uses either the given
    *prefix* or the given namespace *uri* is removed before the new mapping
    is stored.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix is reserved or is invalid.

    """
    # "ns" followed by digits is the shape of the automatically generated
    # prefixes, so it cannot be claimed by callers.
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    conflicting = [
        known_uri
        for known_uri, known_prefix in _namespace_map.items()
        if known_uri == uri or known_prefix == prefix
    ]
    for known_uri in conflicting:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix
1012
# Global namespace URI -> prefix table.  _namespaces() consults it when a
# URI has no prefix assigned yet, and register_namespace() updates it in
# place.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting: expose the very same dict object as an
# attribute of register_namespace.
register_namespace._namespace_map = _namespace_map
1027
1028def _raise_serialization_error(text):
1029    raise TypeError(
1030        "cannot serialize %r (type %s)" % (text, type(text).__name__)
1031        )
1032
1033def _escape_cdata(text):
1034    # escape character data
1035    try:
1036        # it's worth avoiding do-nothing calls for strings that are
1037        # shorter than 500 characters, or so.  assume that's, by far,
1038        # the most common case in most applications.
1039        if "&" in text:
1040            text = text.replace("&", "&amp;")
1041        if "<" in text:
1042            text = text.replace("<", "&lt;")
1043        if ">" in text:
1044            text = text.replace(">", "&gt;")
1045        return text
1046    except (TypeError, AttributeError):
1047        _raise_serialization_error(text)
1048
1049def _escape_attrib(text):
1050    # escape attribute value
1051    try:
1052        if "&" in text:
1053            text = text.replace("&", "&amp;")
1054        if "<" in text:
1055            text = text.replace("<", "&lt;")
1056        if ">" in text:
1057            text = text.replace(">", "&gt;")
1058        if "\"" in text:
1059            text = text.replace("\"", "&quot;")
1060        # Although section 2.11 of the XML specification states that CR or
1061        # CR LN should be replaced with just LN, it applies only to EOLNs
1062        # which take part of organizing file into lines. Within attributes,
1063        # we are replacing these with entity numbers, so they do not count.
1064        # http://www.w3.org/TR/REC-xml/#sec-line-ends
1065        # The current solution, contained in following six lines, was
1066        # discussed in issue 17582 and 39011.
1067        if "\r" in text:
1068            text = text.replace("\r", "&#13;")
1069        if "\n" in text:
1070            text = text.replace("\n", "&#10;")
1071        if "\t" in text:
1072            text = text.replace("\t", "&#09;")
1073        return text
1074    except (TypeError, AttributeError):
1075        _raise_serialization_error(text)
1076
1077def _escape_attrib_html(text):
1078    # escape attribute value
1079    try:
1080        if "&" in text:
1081            text = text.replace("&", "&amp;")
1082        if ">" in text:
1083            text = text.replace(">", "&gt;")
1084        if "\"" in text:
1085            text = text.replace("\"", "&quot;")
1086        return text
1087    except (TypeError, AttributeError):
1088        _raise_serialization_error(text)
1089
1090# --------------------------------------------------------------------
1091
def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned. Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output method
    which can be one of "xml" (default), "html", "text" or "c14n",
    *default_namespace* sets the default XML namespace (for "xmlns").
    *xml_declaration* and *short_empty_elements* are handed to
    ElementTree.write() unchanged.

    Returns an (optionally) encoded string containing the XML data.

    """
    # Text output is collected in a str buffer, anything else in bytes.
    if encoding == 'unicode':
        buffer = io.StringIO()
    else:
        buffer = io.BytesIO()
    document = ElementTree(element)
    document.write(buffer, encoding,
                   xml_declaration=xml_declaration,
                   default_namespace=default_namespace,
                   method=method,
                   short_empty_elements=short_empty_elements)
    return buffer.getvalue()
1115
1116class _ListDataStream(io.BufferedIOBase):
1117    """An auxiliary stream accumulating into a list reference."""
1118    def __init__(self, lst):
1119        self.lst = lst
1120
1121    def writable(self):
1122        return True
1123
1124    def seekable(self):
1125        return True
1126
1127    def write(self, b):
1128        self.lst.append(b)
1129
1130    def tell(self):
1131        return len(self.lst)
1132
def tostringlist(element, encoding=None, method=None, *,
                 xml_declaration=None, default_namespace=None,
                 short_empty_elements=True):
    """Generate string representation of XML element as a list of pieces.

    The arguments have the same meaning as for tostring(); they are passed
    on to ElementTree.write().  Returns the list that collected the data
    written to the stream.

    """
    pieces = []
    document = ElementTree(element)
    document.write(_ListDataStream(pieces), encoding,
                   xml_declaration=xml_declaration,
                   default_namespace=default_namespace,
                   method=method,
                   short_empty_elements=short_empty_elements)
    return pieces
1144
1145
def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file, followed by a newline unless the root's tail already
    ends with one.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    ends_with_newline = tail and tail[-1] == "\n"
    if not ends_with_newline:
        sys.stdout.write("\n")
1163
1164
def indent(tree, space="  ", level=0):
    """Indent an XML document by inserting newlines and indentation space
    after elements.

    *tree* is the ElementTree or Element to modify.  The tail of the (root)
    element itself is left alone; the whitespace-only text and tail values
    inside its subtree are replaced.  Text and tails that contain anything
    but whitespace are kept.

    *space* is the whitespace to insert for each indentation level, two
    space characters by default.

    *level* is the initial indentation level. Setting this to a higher
    value than 0 can be used for indenting subtrees that are more deeply
    nested inside of a document.  A negative value raises ValueError.
    """
    root = tree.getroot() if isinstance(tree, ElementTree) else tree
    if level < 0:
        raise ValueError(f"Initial indentation level must be >= 0, got {level}")
    if len(root) == 0:
        return

    # Reduce the memory consumption by reusing indentation strings: entry
    # N holds the newline plus indentation used N levels below the root.
    cache = ["\n" + level * space]

    def _indent_children(parent, depth):
        # Start a new indentation level for the first child, creating its
        # string the first time that depth is reached.
        child_depth = depth + 1
        if child_depth < len(cache):
            child_indentation = cache[child_depth]
        else:
            child_indentation = cache[depth] + space
            cache.append(child_indentation)

        lead = parent.text
        if not (lead and lead.strip()):
            parent.text = child_indentation

        for node in parent:
            if len(node):
                _indent_children(node, child_depth)
            trailing = node.tail
            if not (trailing and trailing.strip()):
                node.tail = child_indentation

        # Dedent after the last child by overwriting the previous
        # indentation; 'node' still refers to the last child here.
        if not node.tail.strip():
            node.tail = cache[depth]

    _indent_children(root, 0)
1213
1214
1215# --------------------------------------------------------------------
1216# parsing
1217
1218
def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    Return the ElementTree instance that was filled by its parse() method.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1231
1232
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    The returned iterator reports what's going on to the user based on the
    *events* it is initialized with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  Its ``root``
    attribute is None at first and is set to the value returned by the
    parser once the whole source has been consumed.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)
    def iterator():
        # Generator body: it does not start running until the first
        # __next__() call, so it sees the final values of 'source' and
        # 'close_source' assigned further down.
        try:
            while True:
                # Hand out whatever is queued before reading more input.
                yield from pullparser.read_events()
                # load event buffer
                data = source.read(16 * 1024)
                if not data:
                    break
                pullparser.feed(data)
            root = pullparser._close_and_return_root()
            # Closing the parser may have queued further events.
            yield from pullparser.read_events()
            it.root = root
        finally:
            # Only close a file that was opened by this function.
            if close_source:
                source.close()

    class IterParseIterator(collections.abc.Iterator):
        # A single generator instance backs the iterator object.
        __next__ = iterator().__next__
    it = IterParseIterator()
    it.root = None
    del iterator, IterParseIterator

    close_source = False
    if not hasattr(source, "read"):
        # Anything without a read() method is treated as a file name.
        source = open(source, "rb")
        close_source = True

    return it
1279
1280
class XMLPullParser:
    """Parser that is fed data explicitly and queues the resulting events.

    *events* is the sequence of event names to report; when omitted, only
    "end" events are reported.  Data is supplied with feed() and the queued
    (event, elem) pairs are retrieved with read_events().
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        requested = ("end",) if events is None else events
        self._parser._setevents(self._events_queue, requested)

    def feed(self, data):
        """Feed encoded data to parser."""
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            self._parser.feed(data)
        except SyntaxError as exc:
            # Defer the error: it is raised from read_events() once the
            # events queued before it have been consumed.
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        # A cleared parser makes further feed() calls fail.
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.  A queued exception is raised when
        its turn comes.
        """
        queue = self._events_queue
        while queue:
            item = queue.popleft()
            if isinstance(item, Exception):
                raise item
            yield item
1332
1333
def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns whatever the parser's close() returns, an Element instance for
    the standard parser.

    """
    active = parser or XMLParser(target=TreeBuilder())
    active.feed(text)
    return active.close()
1349
1350
def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the dict maps each non-empty
    "id" attribute value to its element.  When an id occurs more than once,
    the last element in iteration order is kept.

    """
    active = parser or XMLParser(target=TreeBuilder())
    active.feed(text)
    root = active.close()
    by_id = {}
    for node in root.iter():
        identifier = node.get("id")
        if identifier:
            by_id[identifier] = node
    return root, by_id
1371
# Parse XML document from string constant.  Alias for XML(): both names
# refer to the same function object.
fromstring = XML
1374
def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence of fragments, which are fed to
    the parser one by one in order; *parser* is an optional parser
    instance, defaulting to the standard XMLParser.

    Returns whatever the parser's close() returns, an Element instance for
    the standard parser.

    """
    active = parser or XMLParser(target=TreeBuilder())
    for fragment in sequence:
        active.feed(fragment)
    return active.close()
1389
1390# --------------------------------------------------------------------
1391
1392
class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    *comment_factory* is a factory to create comments to be used instead of
    the standard factory.  If *insert_comments* is false (the default),
    comments will not be inserted into the tree.

    *pi_factory* is a factory to create processing instructions to be used
    instead of the standard factory.  If *insert_pis* is false (the default),
    processing instructions will not be inserted into the tree.
    """
    def __init__(self, element_factory=None, *,
                 comment_factory=None, pi_factory=None,
                 insert_comments=False, insert_pis=False):
        self._data = []    # text chunks collected since the last flush
        self._elem = []    # stack of the elements that are still open
        self._last = None  # element most recently opened, closed or inserted
        self._root = None  # first element started at the top level
        self._tail = None  # true if we're after an end tag
        self.insert_comments = insert_comments
        self.insert_pis = insert_pis
        # Fall back to the standard factories where none was supplied.
        self._comment_factory = (
            Comment if comment_factory is None else comment_factory)
        self._pi_factory = (
            ProcessingInstruction if pi_factory is None else pi_factory)
        self._factory = Element if element_factory is None else element_factory

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._root is not None, "missing toplevel element"
        return self._root

    def _flush(self):
        # Attach the collected text to the last element: as its tail when
        # an end tag (or an inserted comment/PI) came before the text, as
        # its text otherwise.  Text seen before any element is dropped.
        if not self._data:
            return
        target = self._last
        if target is not None:
            text = "".join(self._data)
            if self._tail:
                assert target.tail is None, "internal error (tail)"
                target.tail = text
            else:
                assert target.text is None, "internal error (text)"
                target.text = text
        self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        elem = self._factory(tag, attrs)
        self._last = elem
        open_elements = self._elem
        if open_elements:
            open_elements[-1].append(elem)
        elif self._root is None:
            # Only the first top-level element becomes the root.
            self._root = elem
        open_elements.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag, (
            "end tag mismatch (expected %s, got %s)" % (closed.tag, tag))
        self._tail = 1
        return closed

    def comment(self, text):
        """Create a comment using the comment_factory.

        *text* is the text of the comment.
        """
        return self._handle_single(
            self._comment_factory, self.insert_comments, text)

    def pi(self, target, text=None):
        """Create a processing instruction using the pi_factory.

        *target* is the target name of the processing instruction.
        *text* is the data of the processing instruction, or ''.
        """
        return self._handle_single(
            self._pi_factory, self.insert_pis, target, text)

    def _handle_single(self, factory, insert, *args):
        # Build a childless node with factory(*args).  It is always
        # returned, but it only becomes part of the tree (and the target
        # of following tail text) when *insert* is true.
        node = factory(*args)
        if not insert:
            return node
        self._flush()
        self._last = node
        if self._elem:
            self._elem[-1].append(node)
        self._tail = 1
        return node
1512
1513
1514# also see ElementTree and TreeBuilder
1515class XMLParser:
1516    """Element structure builder for XML source data based on the expat parser.
1517
1518    *target* is an optional target object which defaults to an instance of the
1519    standard TreeBuilder class, *encoding* is an optional encoding string
1520    which if given, overrides the encoding specified in the XML file:
1521    http://www.iana.org/assignments/character-sets
1522
1523    """
1524
1525    def __init__(self, *, target=None, encoding=None):
1526        try:
1527            from xml.parsers import expat
1528        except ImportError:
1529            try:
1530                import pyexpat as expat
1531            except ImportError:
1532                raise ImportError(
1533                    "No module named expat; use SimpleXMLTreeBuilder instead"
1534                    )
1535        parser = expat.ParserCreate(encoding, "}")
1536        if target is None:
1537            target = TreeBuilder()
1538        # underscored names are provided for compatibility only
1539        self.parser = self._parser = parser
1540        self.target = self._target = target
1541        self._error = expat.error
1542        self._names = {} # name memo cache
1543        # main callbacks
1544        parser.DefaultHandlerExpand = self._default
1545        if hasattr(target, 'start'):
1546            parser.StartElementHandler = self._start
1547        if hasattr(target, 'end'):
1548            parser.EndElementHandler = self._end
1549        if hasattr(target, 'start_ns'):
1550            parser.StartNamespaceDeclHandler = self._start_ns
1551        if hasattr(target, 'end_ns'):
1552            parser.EndNamespaceDeclHandler = self._end_ns
1553        if hasattr(target, 'data'):
1554            parser.CharacterDataHandler = target.data
1555        # miscellaneous callbacks
1556        if hasattr(target, 'comment'):
1557            parser.CommentHandler = target.comment
1558        if hasattr(target, 'pi'):
1559            parser.ProcessingInstructionHandler = target.pi
1560        # Configure pyexpat: buffering, new-style attribute handling.
1561        parser.buffer_text = 1
1562        parser.ordered_attributes = 1
1563        parser.specified_attributes = 1
1564        self._doctype = None
1565        self.entity = {}
1566        try:
1567            self.version = "Expat %d.%d.%d" % expat.version_info
1568        except AttributeError:
1569            pass # unknown
1570
    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor).
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        # For every requested event the matching expat handler is replaced
        # by a closure that appends an (event, value) pair to events_queue.
        # The closures receive everything they need as default arguments,
        # so each one keeps the values of the iteration that created it
        # even though the name 'handler' is reused.
        # Raises ValueError for an unknown event name.
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):
                    def handler(prefix, uri, event=event_name, append=append,
                                start_ns=self._start_ns):
                        append((event, start_ns(prefix, uri)))
                else:
                    # No target support: report the (prefix, uri) pair
                    # itself, with None replaced by ''.
                    def handler(prefix, uri, event=event_name, append=append):
                        append((event, (prefix or '', uri or '')))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):
                    def handler(prefix, event=event_name, append=append,
                                end_ns=self._end_ns):
                        append((event, end_ns(prefix)))
                else:
                    # No target support: the event carries None.
                    def handler(prefix, event=event_name, append=append):
                        append((event, None))
                parser.EndNamespaceDeclHandler = handler
            elif event_name == 'comment':
                def handler(text, event=event_name, append=append, self=self):
                    append((event, self.target.comment(text)))
                parser.CommentHandler = handler
            elif event_name == 'pi':
                def handler(pi_target, data, event=event_name, append=append,
                            self=self):
                    append((event, self.target.pi(pi_target, data)))
                parser.ProcessingInstructionHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)
1624
1625    def _raiseerror(self, value):
1626        err = ParseError(value)
1627        err.code = value.code
1628        err.position = value.lineno, value.offset
1629        raise err
1630
1631    def _fixname(self, key):
1632        # expand qname, and convert name string to ascii, if possible
1633        try:
1634            name = self._names[key]
1635        except KeyError:
1636            name = key
1637            if "}" in name:
1638                name = "{" + name
1639            self._names[key] = name
1640        return name
1641
1642    def _start_ns(self, prefix, uri):
1643        return self.target.start_ns(prefix or '', uri or '')
1644
1645    def _end_ns(self, prefix):
1646        return self.target.end_ns(prefix or '')
1647
1648    def _start(self, tag, attr_list):
1649        # Handler for expat's StartElementHandler. Since ordered_attributes
1650        # is set, the attributes are reported as a list of alternating
1651        # attribute name,value.
1652        fixname = self._fixname
1653        tag = fixname(tag)
1654        attrib = {}
1655        if attr_list:
1656            for i in range(0, len(attr_list), 2):
1657                attrib[fixname(attr_list[i])] = attr_list[i+1]
1658        return self.target.start(tag, attrib)
1659
1660    def _end(self, tag):
1661        return self.target.end(self._fixname(tag))
1662
1663    def _default(self, text):
1664        prefix = text[:1]
1665        if prefix == "&":
1666            # deal with undefined entities
1667            try:
1668                data_handler = self.target.data
1669            except AttributeError:
1670                return
1671            try:
1672                data_handler(self.entity[text[1:-1]])
1673            except KeyError:
1674                from xml.parsers import expat
1675                err = expat.error(
1676                    "undefined entity %s: line %d, column %d" %
1677                    (text, self.parser.ErrorLineNumber,
1678                    self.parser.ErrorColumnNumber)
1679                    )
1680                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
1681                err.lineno = self.parser.ErrorLineNumber
1682                err.offset = self.parser.ErrorColumnNumber
1683                raise err
1684        elif prefix == "<" and text[:9] == "<!DOCTYPE":
1685            self._doctype = [] # inside a doctype declaration
1686        elif self._doctype is not None:
1687            # parse doctype contents
1688            if prefix == ">":
1689                self._doctype = None
1690                return
1691            text = text.strip()
1692            if not text:
1693                return
1694            self._doctype.append(text)
1695            n = len(self._doctype)
1696            if n > 2:
1697                type = self._doctype[1]
1698                if type == "PUBLIC" and n == 4:
1699                    name, type, pubid, system = self._doctype
1700                    if pubid:
1701                        pubid = pubid[1:-1]
1702                elif type == "SYSTEM" and n == 3:
1703                    name, type, system = self._doctype
1704                    pubid = None
1705                else:
1706                    return
1707                if hasattr(self.target, "doctype"):
1708                    self.target.doctype(name, pubid, system[1:-1])
1709                elif hasattr(self, "doctype"):
1710                    warnings.warn(
1711                        "The doctype() method of XMLParser is ignored.  "
1712                        "Define doctype() method on the TreeBuilder target.",
1713                        RuntimeWarning)
1714
1715                self._doctype = None
1716
1717    def feed(self, data):
1718        """Feed encoded data to parser."""
1719        try:
1720            self.parser.Parse(data, False)
1721        except self._error as v:
1722            self._raiseerror(v)
1723
1724    def close(self):
1725        """Finish feeding data to parser and return element structure."""
1726        try:
1727            self.parser.Parse(b"", True) # end of data
1728        except self._error as v:
1729            self._raiseerror(v)
1730        try:
1731            close_handler = self.target.close
1732        except AttributeError:
1733            pass
1734        else:
1735            return close_handler()
1736        finally:
1737            # get rid of circular references
1738            del self.parser, self._parser
1739            del self.target, self._target
1740
1741
1742# --------------------------------------------------------------------
1743# C14N 2.0
1744
def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Serialise XML input in its canonical (C14N 2.0) form.

    The input is taken from *xml_data* (an XML string) or, if that is not
    given, from *from_file* (a file path or file-like object).  At least one
    of the two must be provided, otherwise ValueError is raised.

    If *out* is given, it must be a file or file-like object whose
    ``.write()`` method accepts text (not bytes); the canonical output is
    written to it and None is returned.  To write to a file, open it in text
    mode with encoding "utf-8".  Without *out*, the output is returned as a
    text string.

    All remaining keyword *options* are passed on to ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # Collect the output in memory only when the caller gave no sink.
    buffer = io.StringIO() if out is None else None
    if buffer is not None:
        out = buffer

    writer = C14NWriterTarget(out.write, **options)
    parser = XMLParser(target=writer)

    if xml_data is not None:
        parser.feed(xml_data)
        parser.close()
    else:
        # the guard above ensures from_file is set on this path
        parse(from_file, parser=parser)

    return None if buffer is None else buffer.getvalue()
1773
1774
# Matcher for text of the shape "prefix:name": one or more word characters,
# a single colon, then one or more word characters.  Returns a match object
# or None.  Note that "$" also matches just before a trailing newline.
_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match
1776
1777
class C14NWriterTarget:
    """
    Canonicalization writer target for the XMLParser.

    Serialises parse events to XML C14N 2.0.

    The *write* function is used for writing out the resulting data stream
    as text (not bytes).  To write to a file, open it in text mode with encoding
    "utf-8" and pass its ``.write`` method.

    Configuration options:

    - *with_comments*: set to true to include comments
    - *strip_text*: set to true to strip whitespace before and after text content
    - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
    - *qname_aware_tags*: a set of qname aware tag names in which prefixes
                          should be replaced in text content
    - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
                           should be replaced in text content
    - *exclude_attrs*: a set of attribute names that should not be serialised
    - *exclude_tags*: a set of tag names that should not be serialised
    """
    def __init__(self, write, *,
                 with_comments=False, strip_text=False, rewrite_prefixes=False,
                 qname_aware_tags=None, qname_aware_attrs=None,
                 exclude_attrs=None, exclude_tags=None):
        self._write = write
        # Text chunks received since the last flush.
        self._data = []
        self._with_comments = with_comments
        self._strip_text = strip_text
        self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None
        self._exclude_tags = set(exclude_tags) if exclude_tags else None

        self._rewrite_prefixes = rewrite_prefixes
        if qname_aware_tags:
            self._qname_aware_tags = set(qname_aware_tags)
        else:
            self._qname_aware_tags = None
        if qname_aware_attrs:
            self._find_qname_aware_attrs = set(qname_aware_attrs).intersection
        else:
            self._find_qname_aware_attrs = None

        # Stack with globally and newly declared namespaces as (uri, prefix) pairs.
        self._declared_ns_stack = [[
            ("http://www.w3.org/XML/1998/namespace", "xml"),
        ]]
        # Stack with user declared namespace prefixes as (uri, prefix) pairs.
        self._ns_stack = []
        if not rewrite_prefixes:
            self._ns_stack.append(list(_namespace_map.items()))
        self._ns_stack.append([])
        # Maps namespace URIs to generated prefixes (used with rewrite_prefixes).
        self._prefix_map = {}
        # Stack of xml:space "preserve" flags, one entry per open element.
        self._preserve_space = [False]
        # (tag, attrs, new_namespaces) of a qname-aware element whose start
        # tag is held back until its text content is known, else None.
        self._pending_start = None
        self._root_seen = False
        self._root_done = False
        # Nesting depth inside an excluded element (0 when not excluded).
        self._ignored_depth = 0

    def _iter_namespaces(self, ns_stack, _reversed=reversed):
        """Yield the (uri, prefix) pairs of *ns_stack*, innermost level first."""
        for namespaces in _reversed(ns_stack):
            if namespaces:  # almost no element declares new namespaces
                yield from namespaces

    def _resolve_prefix_name(self, prefixed_name):
        """Return "prefix:name" in ``{uri}name`` form.

        The prefix is looked up in the user declared namespaces; ValueError
        is raised if it is not declared.
        """
        prefix, name = prefixed_name.split(':', 1)
        for uri, p in self._iter_namespaces(self._ns_stack):
            if p == prefix:
                return f'{{{uri}}}{name}'
        raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')

    def _qname(self, qname, uri=None):
        """Return ``(prefixed name, local name, uri)`` for *qname*.

        *qname* is in ``{uri}name`` form unless *uri* is given separately.
        A namespace that was not written out yet is added to the innermost
        level of the declared namespaces.  ValueError is raised if no prefix
        is known for the namespace.
        """
        if uri is None:
            uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)
        else:
            tag = qname

        prefixes_seen = set()
        for u, prefix in self._iter_namespaces(self._declared_ns_stack):
            # A prefix seen at an inner level hides the same prefix further out.
            if u == uri and prefix not in prefixes_seen:
                return f'{prefix}:{tag}' if prefix else tag, tag, uri
            prefixes_seen.add(prefix)

        # Not declared yet => add new declaration.
        if self._rewrite_prefixes:
            if uri in self._prefix_map:
                prefix = self._prefix_map[uri]
            else:
                prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'
            self._declared_ns_stack[-1].append((uri, prefix))
            return f'{prefix}:{tag}', tag, uri

        if not uri and '' not in prefixes_seen:
            # No default namespace declared => no prefix needed.
            return tag, tag, uri

        for u, prefix in self._iter_namespaces(self._ns_stack):
            if u == uri:
                self._declared_ns_stack[-1].append((uri, prefix))
                return f'{prefix}:{tag}' if prefix else tag, tag, uri

        if not uri:
            # As soon as a default namespace is defined,
            # anything that has no namespace (and thus, no prefix) goes there.
            return tag, tag, uri

        raise ValueError(f'Namespace "{uri}" is not declared in scope')

    def data(self, data):
        """Buffer text content, unless inside an excluded element."""
        if not self._ignored_depth:
            self._data.append(data)

    def _flush(self, _join_text=''.join):
        """Write a pending start tag, if any, followed by the buffered text.

        If a start tag is pending and the text looks like "prefix:name", the
        text is handed to ``_start()`` as qname text instead of being
        written here.  Text is only written once the root element was seen.
        """
        data = _join_text(self._data)
        del self._data[:]
        if self._strip_text and not self._preserve_space[-1]:
            data = data.strip()
        if self._pending_start is not None:
            args, self._pending_start = self._pending_start, None
            qname_text = data if data and _looks_like_prefix_name(data) else None
            self._start(*args, qname_text)
            if qname_text is not None:
                return
        if data and self._root_seen:
            self._write(_escape_cdata_c14n(data))

    def start_ns(self, prefix, uri):
        """Record a namespace declaration for the element that follows."""
        if self._ignored_depth:
            return
        # we may have to resolve qnames in text content
        # A pending start tag must be written even without text, since
        # writing it opens the namespace level that this declaration
        # belongs to.
        if self._data or self._pending_start is not None:
            self._flush()
        self._ns_stack[-1].append((uri, prefix))

    def start(self, tag, attrs):
        """Handle an element start.

        Excluded elements (and everything inside them) are only counted.
        The start tag of a qname-aware element is held back until its text
        content is known; all other start tags are written immediately.
        """
        if self._exclude_tags is not None and (
                self._ignored_depth or tag in self._exclude_tags):
            self._ignored_depth += 1
            return
        # Also flush when the parent's start tag is still pending without
        # any text, otherwise it would be overwritten and never written.
        if self._data or self._pending_start is not None:
            self._flush()

        new_namespaces = []
        self._declared_ns_stack.append(new_namespaces)

        if self._qname_aware_tags is not None and tag in self._qname_aware_tags:
            # Need to parse text first to see if it requires a prefix declaration.
            self._pending_start = (tag, attrs, new_namespaces)
            return
        self._start(tag, attrs, new_namespaces)

    def _start(self, tag, attrs, new_namespaces, qname_text=None):
        """Write the start tag of an element.

        *new_namespaces* is the list that receives the namespaces which get
        declared on this element.  *qname_text*, if given, is the element's
        "prefix:name" text content; it is written right after the tag with
        its prefix adapted.
        """
        if self._exclude_attrs is not None and attrs:
            attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs}

        qnames = {tag, *attrs}
        resolved_names = {}

        # Resolve prefixes in attribute and tag text.
        if qname_text is not None:
            qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)
            qnames.add(qname)
        if self._find_qname_aware_attrs is not None and attrs:
            qattrs = self._find_qname_aware_attrs(attrs)
            if qattrs:
                for attr_name in qattrs:
                    value = attrs[attr_name]
                    if _looks_like_prefix_name(value):
                        qname = resolved_names[value] = self._resolve_prefix_name(value)
                        qnames.add(qname)
            else:
                qattrs = None
        else:
            qattrs = None

        # Assign prefixes in lexicographical order of used URIs.
        parse_qname = self._qname
        parsed_qnames = {n: parse_qname(n) for n in sorted(
            qnames, key=lambda n: n.split('}', 1))}

        # Write namespace declarations in prefix order ...
        if new_namespaces:
            attr_list = [
                ('xmlns:' + prefix if prefix else 'xmlns', uri)
                for uri, prefix in new_namespaces
            ]
            attr_list.sort()
        else:
            # almost always empty
            attr_list = []

        # ... followed by attributes in URI+name order
        if attrs:
            for k, v in sorted(attrs.items()):
                if qattrs is not None and k in qattrs and v in resolved_names:
                    v = parsed_qnames[resolved_names[v]][0]
                attr_qname, attr_name, uri = parsed_qnames[k]
                # No prefix for attributes in default ('') namespace.
                attr_list.append((attr_qname if uri else attr_name, v))

        # Honour xml:space attributes.
        space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')
        self._preserve_space.append(
            space_behaviour == 'preserve' if space_behaviour
            else self._preserve_space[-1])

        # Write the tag.
        write = self._write
        write('<' + parsed_qnames[tag][0])
        if attr_list:
            write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))
        write('>')

        # Write the resolved qname text content.
        if qname_text is not None:
            write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))

        self._root_seen = True
        # Open the level that collects namespace declarations of child elements.
        self._ns_stack.append([])

    def end(self, tag):
        """Write the end tag of an element and leave its scope."""
        if self._ignored_depth:
            self._ignored_depth -= 1
            return
        # An empty qname-aware element still has its start tag pending; it
        # has to be written before the end tag, and its stack entries have
        # to exist before they are popped below.
        if self._data or self._pending_start is not None:
            self._flush()
        self._write(f'</{self._qname(tag)[0]}>')
        self._preserve_space.pop()
        # Only the initial entry is left once the root element is closed.
        self._root_done = len(self._preserve_space) == 1
        self._declared_ns_stack.pop()
        self._ns_stack.pop()

    def comment(self, text):
        """Write a comment, if comments are enabled.

        Comments before and after the root element are separated from it by
        a newline.
        """
        if not self._with_comments:
            return
        if self._ignored_depth:
            return
        if self._root_done:
            self._write('\n')
        elif self._pending_start is not None or (
                self._root_seen and self._data):
            # A pending start tag has to precede the comment.
            self._flush()
        self._write(f'<!--{_escape_cdata_c14n(text)}-->')
        if not self._root_seen:
            self._write('\n')

    def pi(self, target, data):
        """Write a processing instruction.

        Processing instructions before and after the root element are
        separated from it by a newline.
        """
        if self._ignored_depth:
            return
        if self._root_done:
            self._write('\n')
        elif self._pending_start is not None or (
                self._root_seen and self._data):
            # A pending start tag has to precede the processing instruction.
            self._flush()
        self._write(
            f'<?{target} {_escape_cdata_c14n(data)}?>' if data else f'<?{target}?>')
        if not self._root_seen:
            self._write('\n')
2034
2035
def _escape_cdata_c14n(text):
    # Escape character data: "&", "<", ">" and carriage return.
    try:
        # Each character is tested for before it is replaced: it's worth
        # avoiding do-nothing replace() calls for strings that are shorter
        # than 500 characters or so, which is assumed to be by far the most
        # common case in most applications.
        for char, escaped in (
                ('&', '&amp;'),  # first, so later escapes are not re-escaped
                ('<', '&lt;'),
                ('>', '&gt;'),
                ('\r', '&#xD;'),
        ):
            if char in text:
                text = text.replace(char, escaped)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
2053
2054
def _escape_attrib_c14n(text):
    # Escape an attribute value: "&", "<", the double quote, tab, newline
    # and carriage return.
    try:
        for char, escaped in (
                ('&', '&amp;'),  # first, so later escapes are not re-escaped
                ('<', '&lt;'),
                ('"', '&quot;'),
                ('\t', '&#x9;'),
                ('\n', '&#xA;'),
                ('\r', '&#xD;'),
        ):
            if char in text:
                text = text.replace(char, escaped)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
2073
2074
2075# --------------------------------------------------------------------
2076
2077# Import the C accelerators
try:
    # Element is going to be shadowed by the C implementation. We need to keep
    # the Python version of it accessible for some "creative" use by external
    # code (see tests)
    _Element_Py = Element

    # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories
    from _elementtree import *
    from _elementtree import _set_factories
except ImportError:
    # The C accelerator module cannot be imported: the pure Python
    # definitions above stay in place.
    pass
else:
    # Hand the Python-level Comment and ProcessingInstruction factories to
    # the C implementation.
    _set_factories(Comment, ProcessingInstruction)
2091