1"""Lightweight XML support for Python. 2 3 XML is an inherently hierarchical data format, and the most natural way to 4 represent it is with a tree. This module has two classes for this purpose: 5 6 1. ElementTree represents the whole XML document as a tree and 7 8 2. Element represents a single node in this tree. 9 10 Interactions with the whole document (reading and writing to/from files) are 11 usually done on the ElementTree level. Interactions with a single XML element 12 and its sub-elements are done on the Element level. 13 14 Element is a flexible container object designed to store hierarchical data 15 structures in memory. It can be described as a cross between a list and a 16 dictionary. Each Element has a number of properties associated with it: 17 18 'tag' - a string containing the element's name. 19 20 'attributes' - a Python dictionary storing the element's attributes. 21 22 'text' - a string containing the element's text content. 23 24 'tail' - an optional string containing text after the element's end tag. 25 26 And a number of child elements stored in a Python sequence. 27 28 To create an element instance, use the Element constructor, 29 or the SubElement factory function. 30 31 You can also use the ElementTree class to wrap an element structure 32 and convert it to and from XML. 33 34""" 35 36#--------------------------------------------------------------------- 37# Licensed to PSF under a Contributor Agreement. 38# See https://www.python.org/psf/license for licensing details. 39# 40# ElementTree 41# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved. 42# 43# fredrik@pythonware.com 44# http://www.pythonware.com 45# -------------------------------------------------------------------- 46# The ElementTree toolkit is 47# 48# Copyright (c) 1999-2008 by Fredrik Lundh 49# 50# By obtaining, using, and/or copying this software and/or its 51# associated documentation, you agree that you have read, understood, 52# and will comply with the following terms and conditions: 53# 54# Permission to use, copy, modify, and distribute this software and 55# its associated documentation for any purpose and without fee is 56# hereby granted, provided that the above copyright notice appears in 57# all copies, and that both that copyright notice and this permission 58# notice appear in supporting documentation, and that the name of 59# Secret Labs AB or the author not be used in advertising or publicity 60# pertaining to distribution of the software without specific, written 61# prior permission. 62# 63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD 64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- 65# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR 66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY 67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 70# OF THIS SOFTWARE. 71# -------------------------------------------------------------------- 72 73__all__ = [ 74 # public symbols 75 "Comment", 76 "dump", 77 "Element", "ElementTree", 78 "fromstring", "fromstringlist", 79 "indent", "iselement", "iterparse", 80 "parse", "ParseError", 81 "PI", "ProcessingInstruction", 82 "QName", 83 "SubElement", 84 "tostring", "tostringlist", 85 "TreeBuilder", 86 "VERSION", 87 "XML", "XMLID", 88 "XMLParser", "XMLPullParser", 89 "register_namespace", 90 "canonicalize", "C14NWriterTarget", 91 ] 92 93VERSION = "1.3.0" 94 95import sys 96import re 97import warnings 98import io 99import collections 100import collections.abc 101import contextlib 102 103from . import ElementPath 104 105 106class ParseError(SyntaxError): 107 """An error when parsing an XML document. 108 109 In addition to its exception value, a ParseError contains 110 two extra attributes: 111 'code' - the specific exception code 112 'position' - the line and column of the error 113 114 """ 115 pass 116 117# -------------------------------------------------------------------- 118 119 120def iselement(element): 121 """Return True if *element* appears to be an Element.""" 122 return hasattr(element, 'tag') 123 124 125class Element: 126 """An XML element. 127 128 This class is the reference implementation of the Element interface. 129 130 An element's length is its number of subelements. That means if you 131 want to check if an element is truly empty, you should check BOTH 132 its length AND its text attribute. 133 134 The element tag, attribute names, and attribute values can be either 135 bytes or strings. 136 137 *tag* is the element name. *attrib* is an optional dictionary containing 138 element attributes. *extra* are additional element attributes given as 139 keyword arguments. 140 141 Example form: 142 <tag attrib>text<child/>...</tag>tail 143 144 """ 145 146 tag = None 147 """The element's name.""" 148 149 attrib = None 150 """Dictionary of the element's attributes.""" 151 152 text = None 153 """ 154 Text before first subelement. This is either a string or the value None. 155 Note that if there is no text, this attribute may be either 156 None or the empty string, depending on the parser. 157 158 """ 159 160 tail = None 161 """ 162 Text after this element's end tag, but before the next sibling element's 163 start tag. This is either a string or the value None. Note that if there 164 was no text, this attribute may be either None or an empty string, 165 depending on the parser. 166 167 """ 168 169 def __init__(self, tag, attrib={}, **extra): 170 if not isinstance(attrib, dict): 171 raise TypeError("attrib must be dict, not %s" % ( 172 attrib.__class__.__name__,)) 173 self.tag = tag 174 self.attrib = {**attrib, **extra} 175 self._children = [] 176 177 def __repr__(self): 178 return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self)) 179 180 def makeelement(self, tag, attrib): 181 """Create a new element with the same type. 182 183 *tag* is a string containing the element name. 184 *attrib* is a dictionary containing the element attributes. 185 186 Do not call this method, use the SubElement factory function instead. 187 188 """ 189 return self.__class__(tag, attrib) 190 191 def copy(self): 192 """Return copy of current element. 193 194 This creates a shallow copy. Subelements will be shared with the 195 original tree. 196 197 """ 198 warnings.warn( 199 "elem.copy() is deprecated. Use copy.copy(elem) instead.", 200 DeprecationWarning 201 ) 202 return self.__copy__() 203 204 def __copy__(self): 205 elem = self.makeelement(self.tag, self.attrib) 206 elem.text = self.text 207 elem.tail = self.tail 208 elem[:] = self 209 return elem 210 211 def __len__(self): 212 return len(self._children) 213 214 def __bool__(self): 215 warnings.warn( 216 "The behavior of this method will change in future versions. " 217 "Use specific 'len(elem)' or 'elem is not None' test instead.", 218 FutureWarning, stacklevel=2 219 ) 220 return len(self._children) != 0 # emulate old behaviour, for now 221 222 def __getitem__(self, index): 223 return self._children[index] 224 225 def __setitem__(self, index, element): 226 if isinstance(index, slice): 227 for elt in element: 228 self._assert_is_element(elt) 229 else: 230 self._assert_is_element(element) 231 self._children[index] = element 232 233 def __delitem__(self, index): 234 del self._children[index] 235 236 def append(self, subelement): 237 """Add *subelement* to the end of this element. 238 239 The new element will appear in document order after the last existing 240 subelement (or directly after the text, if it's the first subelement), 241 but before the end tag for this element. 242 243 """ 244 self._assert_is_element(subelement) 245 self._children.append(subelement) 246 247 def extend(self, elements): 248 """Append subelements from a sequence. 249 250 *elements* is a sequence with zero or more elements. 251 252 """ 253 for element in elements: 254 self._assert_is_element(element) 255 self._children.append(element) 256 257 def insert(self, index, subelement): 258 """Insert *subelement* at position *index*.""" 259 self._assert_is_element(subelement) 260 self._children.insert(index, subelement) 261 262 def _assert_is_element(self, e): 263 # Need to refer to the actual Python implementation, not the 264 # shadowing C implementation. 265 if not isinstance(e, _Element_Py): 266 raise TypeError('expected an Element, not %s' % type(e).__name__) 267 268 def remove(self, subelement): 269 """Remove matching subelement. 270 271 Unlike the find methods, this method compares elements based on 272 identity, NOT ON tag value or contents. To remove subelements by 273 other means, the easiest way is to use a list comprehension to 274 select what elements to keep, and then use slice assignment to update 275 the parent element. 276 277 ValueError is raised if a matching element could not be found. 278 279 """ 280 # assert iselement(element) 281 self._children.remove(subelement) 282 283 def find(self, path, namespaces=None): 284 """Find first matching element by tag name or path. 285 286 *path* is a string having either an element tag or an XPath, 287 *namespaces* is an optional mapping from namespace prefix to full name. 288 289 Return the first matching element, or None if no element was found. 290 291 """ 292 return ElementPath.find(self, path, namespaces) 293 294 def findtext(self, path, default=None, namespaces=None): 295 """Find text for first matching element by tag name or path. 296 297 *path* is a string having either an element tag or an XPath, 298 *default* is the value to return if the element was not found, 299 *namespaces* is an optional mapping from namespace prefix to full name. 300 301 Return text content of first matching element, or default value if 302 none was found. Note that if an element is found having no text 303 content, the empty string is returned. 304 305 """ 306 return ElementPath.findtext(self, path, default, namespaces) 307 308 def findall(self, path, namespaces=None): 309 """Find all matching subelements by tag name or path. 310 311 *path* is a string having either an element tag or an XPath, 312 *namespaces* is an optional mapping from namespace prefix to full name. 313 314 Returns list containing all matching elements in document order. 315 316 """ 317 return ElementPath.findall(self, path, namespaces) 318 319 def iterfind(self, path, namespaces=None): 320 """Find all matching subelements by tag name or path. 321 322 *path* is a string having either an element tag or an XPath, 323 *namespaces* is an optional mapping from namespace prefix to full name. 324 325 Return an iterable yielding all matching elements in document order. 326 327 """ 328 return ElementPath.iterfind(self, path, namespaces) 329 330 def clear(self): 331 """Reset element. 332 333 This function removes all subelements, clears all attributes, and sets 334 the text and tail attributes to None. 335 336 """ 337 self.attrib.clear() 338 self._children = [] 339 self.text = self.tail = None 340 341 def get(self, key, default=None): 342 """Get element attribute. 343 344 Equivalent to attrib.get, but some implementations may handle this a 345 bit more efficiently. *key* is what attribute to look for, and 346 *default* is what to return if the attribute was not found. 347 348 Returns a string containing the attribute value, or the default if 349 attribute was not found. 350 351 """ 352 return self.attrib.get(key, default) 353 354 def set(self, key, value): 355 """Set element attribute. 356 357 Equivalent to attrib[key] = value, but some implementations may handle 358 this a bit more efficiently. *key* is what attribute to set, and 359 *value* is the attribute value to set it to. 360 361 """ 362 self.attrib[key] = value 363 364 def keys(self): 365 """Get list of attribute names. 366 367 Names are returned in an arbitrary order, just like an ordinary 368 Python dict. Equivalent to attrib.keys() 369 370 """ 371 return self.attrib.keys() 372 373 def items(self): 374 """Get element attributes as a sequence. 375 376 The attributes are returned in arbitrary order. Equivalent to 377 attrib.items(). 378 379 Return a list of (name, value) tuples. 380 381 """ 382 return self.attrib.items() 383 384 def iter(self, tag=None): 385 """Create tree iterator. 386 387 The iterator loops over the element and all subelements in document 388 order, returning all elements with a matching tag. 389 390 If the tree structure is modified during iteration, new or removed 391 elements may or may not be included. To get a stable set, use the 392 list() function on the iterator, and loop over the resulting list. 393 394 *tag* is what tags to look for (default is to return all elements) 395 396 Return an iterator containing all the matching elements. 397 398 """ 399 if tag == "*": 400 tag = None 401 if tag is None or self.tag == tag: 402 yield self 403 for e in self._children: 404 yield from e.iter(tag) 405 406 def itertext(self): 407 """Create text iterator. 408 409 The iterator loops over the element and all subelements in document 410 order, returning all inner text. 411 412 """ 413 tag = self.tag 414 if not isinstance(tag, str) and tag is not None: 415 return 416 t = self.text 417 if t: 418 yield t 419 for e in self: 420 yield from e.itertext() 421 t = e.tail 422 if t: 423 yield t 424 425 426def SubElement(parent, tag, attrib={}, **extra): 427 """Subelement factory which creates an element instance, and appends it 428 to an existing parent. 429 430 The element tag, attribute names, and attribute values can be either 431 bytes or Unicode strings. 432 433 *parent* is the parent element, *tag* is the subelements name, *attrib* is 434 an optional directory containing element attributes, *extra* are 435 additional attributes given as keyword arguments. 436 437 """ 438 attrib = {**attrib, **extra} 439 element = parent.makeelement(tag, attrib) 440 parent.append(element) 441 return element 442 443 444def Comment(text=None): 445 """Comment element factory. 446 447 This function creates a special element which the standard serializer 448 serializes as an XML comment. 449 450 *text* is a string containing the comment string. 451 452 """ 453 element = Element(Comment) 454 element.text = text 455 return element 456 457 458def ProcessingInstruction(target, text=None): 459 """Processing Instruction element factory. 460 461 This function creates a special element which the standard serializer 462 serializes as an XML comment. 463 464 *target* is a string containing the processing instruction, *text* is a 465 string containing the processing instruction contents, if any. 466 467 """ 468 element = Element(ProcessingInstruction) 469 element.text = target 470 if text: 471 element.text = element.text + " " + text 472 return element 473 474PI = ProcessingInstruction 475 476 477class QName: 478 """Qualified name wrapper. 479 480 This class can be used to wrap a QName attribute value in order to get 481 proper namespace handing on output. 482 483 *text_or_uri* is a string containing the QName value either in the form 484 {uri}local, or if the tag argument is given, the URI part of a QName. 485 486 *tag* is an optional argument which if given, will make the first 487 argument (text_or_uri) be interpreted as a URI, and this argument (tag) 488 be interpreted as a local name. 489 490 """ 491 def __init__(self, text_or_uri, tag=None): 492 if tag: 493 text_or_uri = "{%s}%s" % (text_or_uri, tag) 494 self.text = text_or_uri 495 def __str__(self): 496 return self.text 497 def __repr__(self): 498 return '<%s %r>' % (self.__class__.__name__, self.text) 499 def __hash__(self): 500 return hash(self.text) 501 def __le__(self, other): 502 if isinstance(other, QName): 503 return self.text <= other.text 504 return self.text <= other 505 def __lt__(self, other): 506 if isinstance(other, QName): 507 return self.text < other.text 508 return self.text < other 509 def __ge__(self, other): 510 if isinstance(other, QName): 511 return self.text >= other.text 512 return self.text >= other 513 def __gt__(self, other): 514 if isinstance(other, QName): 515 return self.text > other.text 516 return self.text > other 517 def __eq__(self, other): 518 if isinstance(other, QName): 519 return self.text == other.text 520 return self.text == other 521 522# -------------------------------------------------------------------- 523 524 525class ElementTree: 526 """An XML element hierarchy. 527 528 This class also provides support for serialization to and from 529 standard XML. 530 531 *element* is an optional root element node, 532 *file* is an optional file handle or file name of an XML file whose 533 contents will be used to initialize the tree with. 534 535 """ 536 def __init__(self, element=None, file=None): 537 # assert element is None or iselement(element) 538 self._root = element # first node 539 if file: 540 self.parse(file) 541 542 def getroot(self): 543 """Return root element of this tree.""" 544 return self._root 545 546 def _setroot(self, element): 547 """Replace root element of this tree. 548 549 This will discard the current contents of the tree and replace it 550 with the given element. Use with care! 551 552 """ 553 # assert iselement(element) 554 self._root = element 555 556 def parse(self, source, parser=None): 557 """Load external XML document into element tree. 558 559 *source* is a file name or file object, *parser* is an optional parser 560 instance that defaults to XMLParser. 561 562 ParseError is raised if the parser fails to parse the document. 563 564 Returns the root element of the given source document. 565 566 """ 567 close_source = False 568 if not hasattr(source, "read"): 569 source = open(source, "rb") 570 close_source = True 571 try: 572 if parser is None: 573 # If no parser was specified, create a default XMLParser 574 parser = XMLParser() 575 if hasattr(parser, '_parse_whole'): 576 # The default XMLParser, when it comes from an accelerator, 577 # can define an internal _parse_whole API for efficiency. 578 # It can be used to parse the whole source without feeding 579 # it with chunks. 580 self._root = parser._parse_whole(source) 581 return self._root 582 while True: 583 data = source.read(65536) 584 if not data: 585 break 586 parser.feed(data) 587 self._root = parser.close() 588 return self._root 589 finally: 590 if close_source: 591 source.close() 592 593 def iter(self, tag=None): 594 """Create and return tree iterator for the root element. 595 596 The iterator loops over all elements in this tree, in document order. 597 598 *tag* is a string with the tag name to iterate over 599 (default is to return all elements). 600 601 """ 602 # assert self._root is not None 603 return self._root.iter(tag) 604 605 def find(self, path, namespaces=None): 606 """Find first matching element by tag name or path. 607 608 Same as getroot().find(path), which is Element.find() 609 610 *path* is a string having either an element tag or an XPath, 611 *namespaces* is an optional mapping from namespace prefix to full name. 612 613 Return the first matching element, or None if no element was found. 614 615 """ 616 # assert self._root is not None 617 if path[:1] == "/": 618 path = "." + path 619 warnings.warn( 620 "This search is broken in 1.3 and earlier, and will be " 621 "fixed in a future version. If you rely on the current " 622 "behaviour, change it to %r" % path, 623 FutureWarning, stacklevel=2 624 ) 625 return self._root.find(path, namespaces) 626 627 def findtext(self, path, default=None, namespaces=None): 628 """Find first matching element by tag name or path. 629 630 Same as getroot().findtext(path), which is Element.findtext() 631 632 *path* is a string having either an element tag or an XPath, 633 *namespaces* is an optional mapping from namespace prefix to full name. 634 635 Return the first matching element, or None if no element was found. 636 637 """ 638 # assert self._root is not None 639 if path[:1] == "/": 640 path = "." + path 641 warnings.warn( 642 "This search is broken in 1.3 and earlier, and will be " 643 "fixed in a future version. If you rely on the current " 644 "behaviour, change it to %r" % path, 645 FutureWarning, stacklevel=2 646 ) 647 return self._root.findtext(path, default, namespaces) 648 649 def findall(self, path, namespaces=None): 650 """Find all matching subelements by tag name or path. 651 652 Same as getroot().findall(path), which is Element.findall(). 653 654 *path* is a string having either an element tag or an XPath, 655 *namespaces* is an optional mapping from namespace prefix to full name. 656 657 Return list containing all matching elements in document order. 658 659 """ 660 # assert self._root is not None 661 if path[:1] == "/": 662 path = "." + path 663 warnings.warn( 664 "This search is broken in 1.3 and earlier, and will be " 665 "fixed in a future version. If you rely on the current " 666 "behaviour, change it to %r" % path, 667 FutureWarning, stacklevel=2 668 ) 669 return self._root.findall(path, namespaces) 670 671 def iterfind(self, path, namespaces=None): 672 """Find all matching subelements by tag name or path. 673 674 Same as getroot().iterfind(path), which is element.iterfind() 675 676 *path* is a string having either an element tag or an XPath, 677 *namespaces* is an optional mapping from namespace prefix to full name. 678 679 Return an iterable yielding all matching elements in document order. 680 681 """ 682 # assert self._root is not None 683 if path[:1] == "/": 684 path = "." + path 685 warnings.warn( 686 "This search is broken in 1.3 and earlier, and will be " 687 "fixed in a future version. If you rely on the current " 688 "behaviour, change it to %r" % path, 689 FutureWarning, stacklevel=2 690 ) 691 return self._root.iterfind(path, namespaces) 692 693 def write(self, file_or_filename, 694 encoding=None, 695 xml_declaration=None, 696 default_namespace=None, 697 method=None, *, 698 short_empty_elements=True): 699 """Write element tree to a file as XML. 700 701 Arguments: 702 *file_or_filename* -- file name or a file object opened for writing 703 704 *encoding* -- the output encoding (default: US-ASCII) 705 706 *xml_declaration* -- bool indicating if an XML declaration should be 707 added to the output. If None, an XML declaration 708 is added if encoding IS NOT either of: 709 US-ASCII, UTF-8, or Unicode 710 711 *default_namespace* -- sets the default XML namespace (for "xmlns") 712 713 *method* -- either "xml" (default), "html, "text", or "c14n" 714 715 *short_empty_elements* -- controls the formatting of elements 716 that contain no content. If True (default) 717 they are emitted as a single self-closed 718 tag, otherwise they are emitted as a pair 719 of start/end tags 720 721 """ 722 if not method: 723 method = "xml" 724 elif method not in _serialize: 725 raise ValueError("unknown method %r" % method) 726 if not encoding: 727 if method == "c14n": 728 encoding = "utf-8" 729 else: 730 encoding = "us-ascii" 731 enc_lower = encoding.lower() 732 with _get_writer(file_or_filename, enc_lower) as write: 733 if method == "xml" and (xml_declaration or 734 (xml_declaration is None and 735 enc_lower not in ("utf-8", "us-ascii", "unicode"))): 736 declared_encoding = encoding 737 if enc_lower == "unicode": 738 # Retrieve the default encoding for the xml declaration 739 import locale 740 declared_encoding = locale.getpreferredencoding() 741 write("<?xml version='1.0' encoding='%s'?>\n" % ( 742 declared_encoding,)) 743 if method == "text": 744 _serialize_text(write, self._root) 745 else: 746 qnames, namespaces = _namespaces(self._root, default_namespace) 747 serialize = _serialize[method] 748 serialize(write, self._root, qnames, namespaces, 749 short_empty_elements=short_empty_elements) 750 751 def write_c14n(self, file): 752 # lxml.etree compatibility. use output method instead 753 return self.write(file, method="c14n") 754 755# -------------------------------------------------------------------- 756# serialization support 757 758@contextlib.contextmanager 759def _get_writer(file_or_filename, encoding): 760 # returns text write method and release all resources after using 761 try: 762 write = file_or_filename.write 763 except AttributeError: 764 # file_or_filename is a file name 765 if encoding == "unicode": 766 file = open(file_or_filename, "w") 767 else: 768 file = open(file_or_filename, "w", encoding=encoding, 769 errors="xmlcharrefreplace") 770 with file: 771 yield file.write 772 else: 773 # file_or_filename is a file-like object 774 # encoding determines if it is a text or binary writer 775 if encoding == "unicode": 776 # use a text writer as is 777 yield write 778 else: 779 # wrap a binary writer with TextIOWrapper 780 with contextlib.ExitStack() as stack: 781 if isinstance(file_or_filename, io.BufferedIOBase): 782 file = file_or_filename 783 elif isinstance(file_or_filename, io.RawIOBase): 784 file = io.BufferedWriter(file_or_filename) 785 # Keep the original file open when the BufferedWriter is 786 # destroyed 787 stack.callback(file.detach) 788 else: 789 # This is to handle passed objects that aren't in the 790 # IOBase hierarchy, but just have a write method 791 file = io.BufferedIOBase() 792 file.writable = lambda: True 793 file.write = write 794 try: 795 # TextIOWrapper uses this methods to determine 796 # if BOM (for UTF-16, etc) should be added 797 file.seekable = file_or_filename.seekable 798 file.tell = file_or_filename.tell 799 except AttributeError: 800 pass 801 file = io.TextIOWrapper(file, 802 encoding=encoding, 803 errors="xmlcharrefreplace", 804 newline="\n") 805 # Keep the original file open when the TextIOWrapper is 806 # destroyed 807 stack.callback(file.detach) 808 yield file.write 809 810def _namespaces(elem, default_namespace=None): 811 # identify namespaces used in this tree 812 813 # maps qnames to *encoded* prefix:local names 814 qnames = {None: None} 815 816 # maps uri:s to prefixes 817 namespaces = {} 818 if default_namespace: 819 namespaces[default_namespace] = "" 820 821 def add_qname(qname): 822 # calculate serialized qname representation 823 try: 824 if qname[:1] == "{": 825 uri, tag = qname[1:].rsplit("}", 1) 826 prefix = namespaces.get(uri) 827 if prefix is None: 828 prefix = _namespace_map.get(uri) 829 if prefix is None: 830 prefix = "ns%d" % len(namespaces) 831 if prefix != "xml": 832 namespaces[uri] = prefix 833 if prefix: 834 qnames[qname] = "%s:%s" % (prefix, tag) 835 else: 836 qnames[qname] = tag # default element 837 else: 838 if default_namespace: 839 # FIXME: can this be handled in XML 1.0? 840 raise ValueError( 841 "cannot use non-qualified names with " 842 "default_namespace option" 843 ) 844 qnames[qname] = qname 845 except TypeError: 846 _raise_serialization_error(qname) 847 848 # populate qname and namespaces table 849 for elem in elem.iter(): 850 tag = elem.tag 851 if isinstance(tag, QName): 852 if tag.text not in qnames: 853 add_qname(tag.text) 854 elif isinstance(tag, str): 855 if tag not in qnames: 856 add_qname(tag) 857 elif tag is not None and tag is not Comment and tag is not PI: 858 _raise_serialization_error(tag) 859 for key, value in elem.items(): 860 if isinstance(key, QName): 861 key = key.text 862 if key not in qnames: 863 add_qname(key) 864 if isinstance(value, QName) and value.text not in qnames: 865 add_qname(value.text) 866 text = elem.text 867 if isinstance(text, QName) and text.text not in qnames: 868 add_qname(text.text) 869 return qnames, namespaces 870 871def _serialize_xml(write, elem, qnames, namespaces, 872 short_empty_elements, **kwargs): 873 tag = elem.tag 874 text = elem.text 875 if tag is Comment: 876 write("<!--%s-->" % text) 877 elif tag is ProcessingInstruction: 878 write("<?%s?>" % text) 879 else: 880 tag = qnames[tag] 881 if tag is None: 882 if text: 883 write(_escape_cdata(text)) 884 for e in elem: 885 _serialize_xml(write, e, qnames, None, 886 short_empty_elements=short_empty_elements) 887 else: 888 write("<" + tag) 889 items = list(elem.items()) 890 if items or namespaces: 891 if namespaces: 892 for v, k in sorted(namespaces.items(), 893 key=lambda x: x[1]): # sort on prefix 894 if k: 895 k = ":" + k 896 write(" xmlns%s=\"%s\"" % ( 897 k, 898 _escape_attrib(v) 899 )) 900 for k, v in items: 901 if isinstance(k, QName): 902 k = k.text 903 if isinstance(v, QName): 904 v = qnames[v.text] 905 else: 906 v = _escape_attrib(v) 907 write(" %s=\"%s\"" % (qnames[k], v)) 908 if text or len(elem) or not short_empty_elements: 909 write(">") 910 if text: 911 write(_escape_cdata(text)) 912 for e in elem: 913 _serialize_xml(write, e, qnames, None, 914 short_empty_elements=short_empty_elements) 915 write("</" + tag + ">") 916 else: 917 write(" />") 918 if elem.tail: 919 write(_escape_cdata(elem.tail)) 920 921HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr", 922 "img", "input", "isindex", "link", "meta", "param") 923 924try: 925 HTML_EMPTY = set(HTML_EMPTY) 926except NameError: 927 pass 928 929def _serialize_html(write, elem, qnames, namespaces, **kwargs): 930 tag = elem.tag 931 text = elem.text 932 if tag is Comment: 933 write("<!--%s-->" % _escape_cdata(text)) 934 elif tag is ProcessingInstruction: 935 write("<?%s?>" % _escape_cdata(text)) 936 else: 937 tag = qnames[tag] 938 if tag is None: 939 if text: 940 write(_escape_cdata(text)) 941 for e in elem: 942 _serialize_html(write, e, qnames, None) 943 else: 944 write("<" + tag) 945 items = list(elem.items()) 946 if items or namespaces: 947 if namespaces: 948 for v, k in sorted(namespaces.items(), 949 key=lambda x: x[1]): # sort on prefix 950 if k: 951 k = ":" + k 952 write(" xmlns%s=\"%s\"" % ( 953 k, 954 _escape_attrib(v) 955 )) 956 for k, v in items: 957 if isinstance(k, QName): 958 k = k.text 959 if isinstance(v, QName): 960 v = qnames[v.text] 961 else: 962 v = _escape_attrib_html(v) 963 # FIXME: handle boolean attributes 964 write(" %s=\"%s\"" % (qnames[k], v)) 965 write(">") 966 ltag = tag.lower() 967 if text: 968 if ltag == "script" or ltag == "style": 969 write(text) 970 else: 971 write(_escape_cdata(text)) 972 for e in elem: 973 _serialize_html(write, e, qnames, None) 974 if ltag not in HTML_EMPTY: 975 write("</" + tag + ">") 976 if elem.tail: 977 write(_escape_cdata(elem.tail)) 978 979def _serialize_text(write, elem): 980 for part in elem.itertext(): 981 write(part) 982 if elem.tail: 983 write(elem.tail) 984 985_serialize = { 986 "xml": _serialize_xml, 987 "html": _serialize_html, 988 "text": _serialize_text, 989# this optional method is imported at the end of the module 990# "c14n": _serialize_c14n, 991} 992 993 994def register_namespace(prefix, uri): 995 """Register a namespace prefix. 996 997 The registry is global, and any existing mapping for either the 998 given prefix or the namespace URI will be removed. 999 1000 *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and 1001 attributes in this namespace will be serialized with prefix if possible. 1002 1003 ValueError is raised if prefix is reserved or is invalid. 1004 1005 """ 1006 if re.match(r"ns\d+$", prefix): 1007 raise ValueError("Prefix format reserved for internal use") 1008 for k, v in list(_namespace_map.items()): 1009 if k == uri or v == prefix: 1010 del _namespace_map[k] 1011 _namespace_map[uri] = prefix 1012 1013_namespace_map = { 1014 # "well-known" namespace prefixes 1015 "http://www.w3.org/XML/1998/namespace": "xml", 1016 "http://www.w3.org/1999/xhtml": "html", 1017 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", 1018 "http://schemas.xmlsoap.org/wsdl/": "wsdl", 1019 # xml schema 1020 "http://www.w3.org/2001/XMLSchema": "xs", 1021 "http://www.w3.org/2001/XMLSchema-instance": "xsi", 1022 # dublin core 1023 "http://purl.org/dc/elements/1.1/": "dc", 1024} 1025# For tests and troubleshooting 1026register_namespace._namespace_map = _namespace_map 1027 1028def _raise_serialization_error(text): 1029 raise TypeError( 1030 "cannot serialize %r (type %s)" % (text, type(text).__name__) 1031 ) 1032 1033def _escape_cdata(text): 1034 # escape character data 1035 try: 1036 # it's worth avoiding do-nothing calls for strings that are 1037 # shorter than 500 characters, or so. assume that's, by far, 1038 # the most common case in most applications. 1039 if "&" in text: 1040 text = text.replace("&", "&") 1041 if "<" in text: 1042 text = text.replace("<", "<") 1043 if ">" in text: 1044 text = text.replace(">", ">") 1045 return text 1046 except (TypeError, AttributeError): 1047 _raise_serialization_error(text) 1048 1049def _escape_attrib(text): 1050 # escape attribute value 1051 try: 1052 if "&" in text: 1053 text = text.replace("&", "&") 1054 if "<" in text: 1055 text = text.replace("<", "<") 1056 if ">" in text: 1057 text = text.replace(">", ">") 1058 if "\"" in text: 1059 text = text.replace("\"", """) 1060 # Although section 2.11 of the XML specification states that CR or 1061 # CR LN should be replaced with just LN, it applies only to EOLNs 1062 # which take part of organizing file into lines. Within attributes, 1063 # we are replacing these with entity numbers, so they do not count. 1064 # http://www.w3.org/TR/REC-xml/#sec-line-ends 1065 # The current solution, contained in following six lines, was 1066 # discussed in issue 17582 and 39011. 1067 if "\r" in text: 1068 text = text.replace("\r", " ") 1069 if "\n" in text: 1070 text = text.replace("\n", " ") 1071 if "\t" in text: 1072 text = text.replace("\t", "	") 1073 return text 1074 except (TypeError, AttributeError): 1075 _raise_serialization_error(text) 1076 1077def _escape_attrib_html(text): 1078 # escape attribute value 1079 try: 1080 if "&" in text: 1081 text = text.replace("&", "&") 1082 if ">" in text: 1083 text = text.replace(">", ">") 1084 if "\"" in text: 1085 text = text.replace("\"", """) 1086 return text 1087 except (TypeError, AttributeError): 1088 _raise_serialization_error(text) 1089 1090# -------------------------------------------------------------------- 1091 1092def tostring(element, encoding=None, method=None, *, 1093 xml_declaration=None, default_namespace=None, 1094 short_empty_elements=True): 1095 """Generate string representation of XML element. 1096 1097 All subelements are included. If encoding is "unicode", a string 1098 is returned. Otherwise a bytestring is returned. 1099 1100 *element* is an Element instance, *encoding* is an optional output 1101 encoding defaulting to US-ASCII, *method* is an optional output which can 1102 be one of "xml" (default), "html", "text" or "c14n", *default_namespace* 1103 sets the default XML namespace (for "xmlns"). 1104 1105 Returns an (optionally) encoded string containing the XML data. 1106 1107 """ 1108 stream = io.StringIO() if encoding == 'unicode' else io.BytesIO() 1109 ElementTree(element).write(stream, encoding, 1110 xml_declaration=xml_declaration, 1111 default_namespace=default_namespace, 1112 method=method, 1113 short_empty_elements=short_empty_elements) 1114 return stream.getvalue() 1115 1116class _ListDataStream(io.BufferedIOBase): 1117 """An auxiliary stream accumulating into a list reference.""" 1118 def __init__(self, lst): 1119 self.lst = lst 1120 1121 def writable(self): 1122 return True 1123 1124 def seekable(self): 1125 return True 1126 1127 def write(self, b): 1128 self.lst.append(b) 1129 1130 def tell(self): 1131 return len(self.lst) 1132 1133def tostringlist(element, encoding=None, method=None, *, 1134 xml_declaration=None, default_namespace=None, 1135 short_empty_elements=True): 1136 lst = [] 1137 stream = _ListDataStream(lst) 1138 ElementTree(element).write(stream, encoding, 1139 xml_declaration=xml_declaration, 1140 default_namespace=default_namespace, 1141 method=method, 1142 short_empty_elements=short_empty_elements) 1143 return lst 1144 1145 1146def dump(elem): 1147 """Write element tree or element structure to sys.stdout. 1148 1149 This function should be used for debugging only. 1150 1151 *elem* is either an ElementTree, or a single Element. The exact output 1152 format is implementation dependent. In this version, it's written as an 1153 ordinary XML file. 1154 1155 """ 1156 # debugging 1157 if not isinstance(elem, ElementTree): 1158 elem = ElementTree(elem) 1159 elem.write(sys.stdout, encoding="unicode") 1160 tail = elem.getroot().tail 1161 if not tail or tail[-1] != "\n": 1162 sys.stdout.write("\n") 1163 1164 1165def indent(tree, space=" ", level=0): 1166 """Indent an XML document by inserting newlines and indentation space 1167 after elements. 1168 1169 *tree* is the ElementTree or Element to modify. The (root) element 1170 itself will not be changed, but the tail text of all elements in its 1171 subtree will be adapted. 1172 1173 *space* is the whitespace to insert for each indentation level, two 1174 space characters by default. 1175 1176 *level* is the initial indentation level. Setting this to a higher 1177 value than 0 can be used for indenting subtrees that are more deeply 1178 nested inside of a document. 1179 """ 1180 if isinstance(tree, ElementTree): 1181 tree = tree.getroot() 1182 if level < 0: 1183 raise ValueError(f"Initial indentation level must be >= 0, got {level}") 1184 if not len(tree): 1185 return 1186 1187 # Reduce the memory consumption by reusing indentation strings. 1188 indentations = ["\n" + level * space] 1189 1190 def _indent_children(elem, level): 1191 # Start a new indentation level for the first child. 1192 child_level = level + 1 1193 try: 1194 child_indentation = indentations[child_level] 1195 except IndexError: 1196 child_indentation = indentations[level] + space 1197 indentations.append(child_indentation) 1198 1199 if not elem.text or not elem.text.strip(): 1200 elem.text = child_indentation 1201 1202 for child in elem: 1203 if len(child): 1204 _indent_children(child, child_level) 1205 if not child.tail or not child.tail.strip(): 1206 child.tail = child_indentation 1207 1208 # Dedent after the last child by overwriting the previous indentation. 1209 if not child.tail.strip(): 1210 child.tail = indentations[level] 1211 1212 _indent_children(tree, 0) 1213 1214 1215# -------------------------------------------------------------------- 1216# parsing 1217 1218 1219def parse(source, parser=None): 1220 """Parse XML document into element tree. 1221 1222 *source* is a filename or file object containing XML data, 1223 *parser* is an optional parser instance defaulting to XMLParser. 1224 1225 Return an ElementTree instance. 1226 1227 """ 1228 tree = ElementTree() 1229 tree.parse(source, parser) 1230 return tree 1231 1232 1233def iterparse(source, events=None, parser=None): 1234 """Incrementally parse XML document into ElementTree. 1235 1236 This class also reports what's going on to the user based on the 1237 *events* it is initialized with. The supported events are the strings 1238 "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get 1239 detailed namespace information). If *events* is omitted, only 1240 "end" events are reported. 1241 1242 *source* is a filename or file object containing XML data, *events* is 1243 a list of events to report back, *parser* is an optional parser instance. 1244 1245 Returns an iterator providing (event, elem) pairs. 1246 1247 """ 1248 # Use the internal, undocumented _parser argument for now; When the 1249 # parser argument of iterparse is removed, this can be killed. 1250 pullparser = XMLPullParser(events=events, _parser=parser) 1251 def iterator(): 1252 try: 1253 while True: 1254 yield from pullparser.read_events() 1255 # load event buffer 1256 data = source.read(16 * 1024) 1257 if not data: 1258 break 1259 pullparser.feed(data) 1260 root = pullparser._close_and_return_root() 1261 yield from pullparser.read_events() 1262 it.root = root 1263 finally: 1264 if close_source: 1265 source.close() 1266 1267 class IterParseIterator(collections.abc.Iterator): 1268 __next__ = iterator().__next__ 1269 it = IterParseIterator() 1270 it.root = None 1271 del iterator, IterParseIterator 1272 1273 close_source = False 1274 if not hasattr(source, "read"): 1275 source = open(source, "rb") 1276 close_source = True 1277 1278 return it 1279 1280 1281class XMLPullParser: 1282 1283 def __init__(self, events=None, *, _parser=None): 1284 # The _parser argument is for internal use only and must not be relied 1285 # upon in user code. It will be removed in a future release. 1286 # See https://bugs.python.org/issue17741 for more details. 1287 1288 self._events_queue = collections.deque() 1289 self._parser = _parser or XMLParser(target=TreeBuilder()) 1290 # wire up the parser for event reporting 1291 if events is None: 1292 events = ("end",) 1293 self._parser._setevents(self._events_queue, events) 1294 1295 def feed(self, data): 1296 """Feed encoded data to parser.""" 1297 if self._parser is None: 1298 raise ValueError("feed() called after end of stream") 1299 if data: 1300 try: 1301 self._parser.feed(data) 1302 except SyntaxError as exc: 1303 self._events_queue.append(exc) 1304 1305 def _close_and_return_root(self): 1306 # iterparse needs this to set its root attribute properly :( 1307 root = self._parser.close() 1308 self._parser = None 1309 return root 1310 1311 def close(self): 1312 """Finish feeding data to parser. 1313 1314 Unlike XMLParser, does not return the root element. Use 1315 read_events() to consume elements from XMLPullParser. 1316 """ 1317 self._close_and_return_root() 1318 1319 def read_events(self): 1320 """Return an iterator over currently available (event, elem) pairs. 1321 1322 Events are consumed from the internal event queue as they are 1323 retrieved from the iterator. 1324 """ 1325 events = self._events_queue 1326 while events: 1327 event = events.popleft() 1328 if isinstance(event, Exception): 1329 raise event 1330 else: 1331 yield event 1332 1333 1334def XML(text, parser=None): 1335 """Parse XML document from string constant. 1336 1337 This function can be used to embed "XML Literals" in Python code. 1338 1339 *text* is a string containing XML data, *parser* is an 1340 optional parser instance, defaulting to the standard XMLParser. 1341 1342 Returns an Element instance. 1343 1344 """ 1345 if not parser: 1346 parser = XMLParser(target=TreeBuilder()) 1347 parser.feed(text) 1348 return parser.close() 1349 1350 1351def XMLID(text, parser=None): 1352 """Parse XML document from string constant for its IDs. 1353 1354 *text* is a string containing XML data, *parser* is an 1355 optional parser instance, defaulting to the standard XMLParser. 1356 1357 Returns an (Element, dict) tuple, in which the 1358 dict maps element id:s to elements. 1359 1360 """ 1361 if not parser: 1362 parser = XMLParser(target=TreeBuilder()) 1363 parser.feed(text) 1364 tree = parser.close() 1365 ids = {} 1366 for elem in tree.iter(): 1367 id = elem.get("id") 1368 if id: 1369 ids[id] = elem 1370 return tree, ids 1371 1372# Parse XML document from string constant. Alias for XML(). 1373fromstring = XML 1374 1375def fromstringlist(sequence, parser=None): 1376 """Parse XML document from sequence of string fragments. 1377 1378 *sequence* is a list of other sequence, *parser* is an optional parser 1379 instance, defaulting to the standard XMLParser. 1380 1381 Returns an Element instance. 1382 1383 """ 1384 if not parser: 1385 parser = XMLParser(target=TreeBuilder()) 1386 for text in sequence: 1387 parser.feed(text) 1388 return parser.close() 1389 1390# -------------------------------------------------------------------- 1391 1392 1393class TreeBuilder: 1394 """Generic element structure builder. 1395 1396 This builder converts a sequence of start, data, and end method 1397 calls to a well-formed element structure. 1398 1399 You can use this class to build an element structure using a custom XML 1400 parser, or a parser for some other XML-like format. 1401 1402 *element_factory* is an optional element factory which is called 1403 to create new Element instances, as necessary. 1404 1405 *comment_factory* is a factory to create comments to be used instead of 1406 the standard factory. If *insert_comments* is false (the default), 1407 comments will not be inserted into the tree. 1408 1409 *pi_factory* is a factory to create processing instructions to be used 1410 instead of the standard factory. If *insert_pis* is false (the default), 1411 processing instructions will not be inserted into the tree. 1412 """ 1413 def __init__(self, element_factory=None, *, 1414 comment_factory=None, pi_factory=None, 1415 insert_comments=False, insert_pis=False): 1416 self._data = [] # data collector 1417 self._elem = [] # element stack 1418 self._last = None # last element 1419 self._root = None # root element 1420 self._tail = None # true if we're after an end tag 1421 if comment_factory is None: 1422 comment_factory = Comment 1423 self._comment_factory = comment_factory 1424 self.insert_comments = insert_comments 1425 if pi_factory is None: 1426 pi_factory = ProcessingInstruction 1427 self._pi_factory = pi_factory 1428 self.insert_pis = insert_pis 1429 if element_factory is None: 1430 element_factory = Element 1431 self._factory = element_factory 1432 1433 def close(self): 1434 """Flush builder buffers and return toplevel document Element.""" 1435 assert len(self._elem) == 0, "missing end tags" 1436 assert self._root is not None, "missing toplevel element" 1437 return self._root 1438 1439 def _flush(self): 1440 if self._data: 1441 if self._last is not None: 1442 text = "".join(self._data) 1443 if self._tail: 1444 assert self._last.tail is None, "internal error (tail)" 1445 self._last.tail = text 1446 else: 1447 assert self._last.text is None, "internal error (text)" 1448 self._last.text = text 1449 self._data = [] 1450 1451 def data(self, data): 1452 """Add text to current element.""" 1453 self._data.append(data) 1454 1455 def start(self, tag, attrs): 1456 """Open new element and return it. 1457 1458 *tag* is the element name, *attrs* is a dict containing element 1459 attributes. 1460 1461 """ 1462 self._flush() 1463 self._last = elem = self._factory(tag, attrs) 1464 if self._elem: 1465 self._elem[-1].append(elem) 1466 elif self._root is None: 1467 self._root = elem 1468 self._elem.append(elem) 1469 self._tail = 0 1470 return elem 1471 1472 def end(self, tag): 1473 """Close and return current Element. 1474 1475 *tag* is the element name. 1476 1477 """ 1478 self._flush() 1479 self._last = self._elem.pop() 1480 assert self._last.tag == tag,\ 1481 "end tag mismatch (expected %s, got %s)" % ( 1482 self._last.tag, tag) 1483 self._tail = 1 1484 return self._last 1485 1486 def comment(self, text): 1487 """Create a comment using the comment_factory. 1488 1489 *text* is the text of the comment. 1490 """ 1491 return self._handle_single( 1492 self._comment_factory, self.insert_comments, text) 1493 1494 def pi(self, target, text=None): 1495 """Create a processing instruction using the pi_factory. 1496 1497 *target* is the target name of the processing instruction. 1498 *text* is the data of the processing instruction, or ''. 1499 """ 1500 return self._handle_single( 1501 self._pi_factory, self.insert_pis, target, text) 1502 1503 def _handle_single(self, factory, insert, *args): 1504 elem = factory(*args) 1505 if insert: 1506 self._flush() 1507 self._last = elem 1508 if self._elem: 1509 self._elem[-1].append(elem) 1510 self._tail = 1 1511 return elem 1512 1513 1514# also see ElementTree and TreeBuilder 1515class XMLParser: 1516 """Element structure builder for XML source data based on the expat parser. 1517 1518 *target* is an optional target object which defaults to an instance of the 1519 standard TreeBuilder class, *encoding* is an optional encoding string 1520 which if given, overrides the encoding specified in the XML file: 1521 http://www.iana.org/assignments/character-sets 1522 1523 """ 1524 1525 def __init__(self, *, target=None, encoding=None): 1526 try: 1527 from xml.parsers import expat 1528 except ImportError: 1529 try: 1530 import pyexpat as expat 1531 except ImportError: 1532 raise ImportError( 1533 "No module named expat; use SimpleXMLTreeBuilder instead" 1534 ) 1535 parser = expat.ParserCreate(encoding, "}") 1536 if target is None: 1537 target = TreeBuilder() 1538 # underscored names are provided for compatibility only 1539 self.parser = self._parser = parser 1540 self.target = self._target = target 1541 self._error = expat.error 1542 self._names = {} # name memo cache 1543 # main callbacks 1544 parser.DefaultHandlerExpand = self._default 1545 if hasattr(target, 'start'): 1546 parser.StartElementHandler = self._start 1547 if hasattr(target, 'end'): 1548 parser.EndElementHandler = self._end 1549 if hasattr(target, 'start_ns'): 1550 parser.StartNamespaceDeclHandler = self._start_ns 1551 if hasattr(target, 'end_ns'): 1552 parser.EndNamespaceDeclHandler = self._end_ns 1553 if hasattr(target, 'data'): 1554 parser.CharacterDataHandler = target.data 1555 # miscellaneous callbacks 1556 if hasattr(target, 'comment'): 1557 parser.CommentHandler = target.comment 1558 if hasattr(target, 'pi'): 1559 parser.ProcessingInstructionHandler = target.pi 1560 # Configure pyexpat: buffering, new-style attribute handling. 1561 parser.buffer_text = 1 1562 parser.ordered_attributes = 1 1563 self._doctype = None 1564 self.entity = {} 1565 try: 1566 self.version = "Expat %d.%d.%d" % expat.version_info 1567 except AttributeError: 1568 pass # unknown 1569 1570 def _setevents(self, events_queue, events_to_report): 1571 # Internal API for XMLPullParser 1572 # events_to_report: a list of events to report during parsing (same as 1573 # the *events* of XMLPullParser's constructor. 1574 # events_queue: a list of actual parsing events that will be populated 1575 # by the underlying parser. 1576 # 1577 parser = self._parser 1578 append = events_queue.append 1579 for event_name in events_to_report: 1580 if event_name == "start": 1581 parser.ordered_attributes = 1 1582 def handler(tag, attrib_in, event=event_name, append=append, 1583 start=self._start): 1584 append((event, start(tag, attrib_in))) 1585 parser.StartElementHandler = handler 1586 elif event_name == "end": 1587 def handler(tag, event=event_name, append=append, 1588 end=self._end): 1589 append((event, end(tag))) 1590 parser.EndElementHandler = handler 1591 elif event_name == "start-ns": 1592 # TreeBuilder does not implement .start_ns() 1593 if hasattr(self.target, "start_ns"): 1594 def handler(prefix, uri, event=event_name, append=append, 1595 start_ns=self._start_ns): 1596 append((event, start_ns(prefix, uri))) 1597 else: 1598 def handler(prefix, uri, event=event_name, append=append): 1599 append((event, (prefix or '', uri or ''))) 1600 parser.StartNamespaceDeclHandler = handler 1601 elif event_name == "end-ns": 1602 # TreeBuilder does not implement .end_ns() 1603 if hasattr(self.target, "end_ns"): 1604 def handler(prefix, event=event_name, append=append, 1605 end_ns=self._end_ns): 1606 append((event, end_ns(prefix))) 1607 else: 1608 def handler(prefix, event=event_name, append=append): 1609 append((event, None)) 1610 parser.EndNamespaceDeclHandler = handler 1611 elif event_name == 'comment': 1612 def handler(text, event=event_name, append=append, self=self): 1613 append((event, self.target.comment(text))) 1614 parser.CommentHandler = handler 1615 elif event_name == 'pi': 1616 def handler(pi_target, data, event=event_name, append=append, 1617 self=self): 1618 append((event, self.target.pi(pi_target, data))) 1619 parser.ProcessingInstructionHandler = handler 1620 else: 1621 raise ValueError("unknown event %r" % event_name) 1622 1623 def _raiseerror(self, value): 1624 err = ParseError(value) 1625 err.code = value.code 1626 err.position = value.lineno, value.offset 1627 raise err 1628 1629 def _fixname(self, key): 1630 # expand qname, and convert name string to ascii, if possible 1631 try: 1632 name = self._names[key] 1633 except KeyError: 1634 name = key 1635 if "}" in name: 1636 name = "{" + name 1637 self._names[key] = name 1638 return name 1639 1640 def _start_ns(self, prefix, uri): 1641 return self.target.start_ns(prefix or '', uri or '') 1642 1643 def _end_ns(self, prefix): 1644 return self.target.end_ns(prefix or '') 1645 1646 def _start(self, tag, attr_list): 1647 # Handler for expat's StartElementHandler. Since ordered_attributes 1648 # is set, the attributes are reported as a list of alternating 1649 # attribute name,value. 1650 fixname = self._fixname 1651 tag = fixname(tag) 1652 attrib = {} 1653 if attr_list: 1654 for i in range(0, len(attr_list), 2): 1655 attrib[fixname(attr_list[i])] = attr_list[i+1] 1656 return self.target.start(tag, attrib) 1657 1658 def _end(self, tag): 1659 return self.target.end(self._fixname(tag)) 1660 1661 def _default(self, text): 1662 prefix = text[:1] 1663 if prefix == "&": 1664 # deal with undefined entities 1665 try: 1666 data_handler = self.target.data 1667 except AttributeError: 1668 return 1669 try: 1670 data_handler(self.entity[text[1:-1]]) 1671 except KeyError: 1672 from xml.parsers import expat 1673 err = expat.error( 1674 "undefined entity %s: line %d, column %d" % 1675 (text, self.parser.ErrorLineNumber, 1676 self.parser.ErrorColumnNumber) 1677 ) 1678 err.code = 11 # XML_ERROR_UNDEFINED_ENTITY 1679 err.lineno = self.parser.ErrorLineNumber 1680 err.offset = self.parser.ErrorColumnNumber 1681 raise err 1682 elif prefix == "<" and text[:9] == "<!DOCTYPE": 1683 self._doctype = [] # inside a doctype declaration 1684 elif self._doctype is not None: 1685 # parse doctype contents 1686 if prefix == ">": 1687 self._doctype = None 1688 return 1689 text = text.strip() 1690 if not text: 1691 return 1692 self._doctype.append(text) 1693 n = len(self._doctype) 1694 if n > 2: 1695 type = self._doctype[1] 1696 if type == "PUBLIC" and n == 4: 1697 name, type, pubid, system = self._doctype 1698 if pubid: 1699 pubid = pubid[1:-1] 1700 elif type == "SYSTEM" and n == 3: 1701 name, type, system = self._doctype 1702 pubid = None 1703 else: 1704 return 1705 if hasattr(self.target, "doctype"): 1706 self.target.doctype(name, pubid, system[1:-1]) 1707 elif hasattr(self, "doctype"): 1708 warnings.warn( 1709 "The doctype() method of XMLParser is ignored. " 1710 "Define doctype() method on the TreeBuilder target.", 1711 RuntimeWarning) 1712 1713 self._doctype = None 1714 1715 def feed(self, data): 1716 """Feed encoded data to parser.""" 1717 try: 1718 self.parser.Parse(data, False) 1719 except self._error as v: 1720 self._raiseerror(v) 1721 1722 def close(self): 1723 """Finish feeding data to parser and return element structure.""" 1724 try: 1725 self.parser.Parse(b"", True) # end of data 1726 except self._error as v: 1727 self._raiseerror(v) 1728 try: 1729 close_handler = self.target.close 1730 except AttributeError: 1731 pass 1732 else: 1733 return close_handler() 1734 finally: 1735 # get rid of circular references 1736 del self.parser, self._parser 1737 del self.target, self._target 1738 1739 1740# -------------------------------------------------------------------- 1741# C14N 2.0 1742 1743def canonicalize(xml_data=None, *, out=None, from_file=None, **options): 1744 """Convert XML to its C14N 2.0 serialised form. 1745 1746 If *out* is provided, it must be a file or file-like object that receives 1747 the serialised canonical XML output (text, not bytes) through its ``.write()`` 1748 method. To write to a file, open it in text mode with encoding "utf-8". 1749 If *out* is not provided, this function returns the output as text string. 1750 1751 Either *xml_data* (an XML string) or *from_file* (a file path or 1752 file-like object) must be provided as input. 1753 1754 The configuration options are the same as for the ``C14NWriterTarget``. 1755 """ 1756 if xml_data is None and from_file is None: 1757 raise ValueError("Either 'xml_data' or 'from_file' must be provided as input") 1758 sio = None 1759 if out is None: 1760 sio = out = io.StringIO() 1761 1762 parser = XMLParser(target=C14NWriterTarget(out.write, **options)) 1763 1764 if xml_data is not None: 1765 parser.feed(xml_data) 1766 parser.close() 1767 elif from_file is not None: 1768 parse(from_file, parser=parser) 1769 1770 return sio.getvalue() if sio is not None else None 1771 1772 1773_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match 1774 1775 1776class C14NWriterTarget: 1777 """ 1778 Canonicalization writer target for the XMLParser. 1779 1780 Serialises parse events to XML C14N 2.0. 1781 1782 The *write* function is used for writing out the resulting data stream 1783 as text (not bytes). To write to a file, open it in text mode with encoding 1784 "utf-8" and pass its ``.write`` method. 1785 1786 Configuration options: 1787 1788 - *with_comments*: set to true to include comments 1789 - *strip_text*: set to true to strip whitespace before and after text content 1790 - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}" 1791 - *qname_aware_tags*: a set of qname aware tag names in which prefixes 1792 should be replaced in text content 1793 - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes 1794 should be replaced in text content 1795 - *exclude_attrs*: a set of attribute names that should not be serialised 1796 - *exclude_tags*: a set of tag names that should not be serialised 1797 """ 1798 def __init__(self, write, *, 1799 with_comments=False, strip_text=False, rewrite_prefixes=False, 1800 qname_aware_tags=None, qname_aware_attrs=None, 1801 exclude_attrs=None, exclude_tags=None): 1802 self._write = write 1803 self._data = [] 1804 self._with_comments = with_comments 1805 self._strip_text = strip_text 1806 self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None 1807 self._exclude_tags = set(exclude_tags) if exclude_tags else None 1808 1809 self._rewrite_prefixes = rewrite_prefixes 1810 if qname_aware_tags: 1811 self._qname_aware_tags = set(qname_aware_tags) 1812 else: 1813 self._qname_aware_tags = None 1814 if qname_aware_attrs: 1815 self._find_qname_aware_attrs = set(qname_aware_attrs).intersection 1816 else: 1817 self._find_qname_aware_attrs = None 1818 1819 # Stack with globally and newly declared namespaces as (uri, prefix) pairs. 1820 self._declared_ns_stack = [[ 1821 ("http://www.w3.org/XML/1998/namespace", "xml"), 1822 ]] 1823 # Stack with user declared namespace prefixes as (uri, prefix) pairs. 1824 self._ns_stack = [] 1825 if not rewrite_prefixes: 1826 self._ns_stack.append(list(_namespace_map.items())) 1827 self._ns_stack.append([]) 1828 self._prefix_map = {} 1829 self._preserve_space = [False] 1830 self._pending_start = None 1831 self._root_seen = False 1832 self._root_done = False 1833 self._ignored_depth = 0 1834 1835 def _iter_namespaces(self, ns_stack, _reversed=reversed): 1836 for namespaces in _reversed(ns_stack): 1837 if namespaces: # almost no element declares new namespaces 1838 yield from namespaces 1839 1840 def _resolve_prefix_name(self, prefixed_name): 1841 prefix, name = prefixed_name.split(':', 1) 1842 for uri, p in self._iter_namespaces(self._ns_stack): 1843 if p == prefix: 1844 return f'{{{uri}}}{name}' 1845 raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope') 1846 1847 def _qname(self, qname, uri=None): 1848 if uri is None: 1849 uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname) 1850 else: 1851 tag = qname 1852 1853 prefixes_seen = set() 1854 for u, prefix in self._iter_namespaces(self._declared_ns_stack): 1855 if u == uri and prefix not in prefixes_seen: 1856 return f'{prefix}:{tag}' if prefix else tag, tag, uri 1857 prefixes_seen.add(prefix) 1858 1859 # Not declared yet => add new declaration. 1860 if self._rewrite_prefixes: 1861 if uri in self._prefix_map: 1862 prefix = self._prefix_map[uri] 1863 else: 1864 prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}' 1865 self._declared_ns_stack[-1].append((uri, prefix)) 1866 return f'{prefix}:{tag}', tag, uri 1867 1868 if not uri and '' not in prefixes_seen: 1869 # No default namespace declared => no prefix needed. 1870 return tag, tag, uri 1871 1872 for u, prefix in self._iter_namespaces(self._ns_stack): 1873 if u == uri: 1874 self._declared_ns_stack[-1].append((uri, prefix)) 1875 return f'{prefix}:{tag}' if prefix else tag, tag, uri 1876 1877 if not uri: 1878 # As soon as a default namespace is defined, 1879 # anything that has no namespace (and thus, no prefix) goes there. 1880 return tag, tag, uri 1881 1882 raise ValueError(f'Namespace "{uri}" is not declared in scope') 1883 1884 def data(self, data): 1885 if not self._ignored_depth: 1886 self._data.append(data) 1887 1888 def _flush(self, _join_text=''.join): 1889 data = _join_text(self._data) 1890 del self._data[:] 1891 if self._strip_text and not self._preserve_space[-1]: 1892 data = data.strip() 1893 if self._pending_start is not None: 1894 args, self._pending_start = self._pending_start, None 1895 qname_text = data if data and _looks_like_prefix_name(data) else None 1896 self._start(*args, qname_text) 1897 if qname_text is not None: 1898 return 1899 if data and self._root_seen: 1900 self._write(_escape_cdata_c14n(data)) 1901 1902 def start_ns(self, prefix, uri): 1903 if self._ignored_depth: 1904 return 1905 # we may have to resolve qnames in text content 1906 if self._data: 1907 self._flush() 1908 self._ns_stack[-1].append((uri, prefix)) 1909 1910 def start(self, tag, attrs): 1911 if self._exclude_tags is not None and ( 1912 self._ignored_depth or tag in self._exclude_tags): 1913 self._ignored_depth += 1 1914 return 1915 if self._data: 1916 self._flush() 1917 1918 new_namespaces = [] 1919 self._declared_ns_stack.append(new_namespaces) 1920 1921 if self._qname_aware_tags is not None and tag in self._qname_aware_tags: 1922 # Need to parse text first to see if it requires a prefix declaration. 1923 self._pending_start = (tag, attrs, new_namespaces) 1924 return 1925 self._start(tag, attrs, new_namespaces) 1926 1927 def _start(self, tag, attrs, new_namespaces, qname_text=None): 1928 if self._exclude_attrs is not None and attrs: 1929 attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs} 1930 1931 qnames = {tag, *attrs} 1932 resolved_names = {} 1933 1934 # Resolve prefixes in attribute and tag text. 1935 if qname_text is not None: 1936 qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text) 1937 qnames.add(qname) 1938 if self._find_qname_aware_attrs is not None and attrs: 1939 qattrs = self._find_qname_aware_attrs(attrs) 1940 if qattrs: 1941 for attr_name in qattrs: 1942 value = attrs[attr_name] 1943 if _looks_like_prefix_name(value): 1944 qname = resolved_names[value] = self._resolve_prefix_name(value) 1945 qnames.add(qname) 1946 else: 1947 qattrs = None 1948 else: 1949 qattrs = None 1950 1951 # Assign prefixes in lexicographical order of used URIs. 1952 parse_qname = self._qname 1953 parsed_qnames = {n: parse_qname(n) for n in sorted( 1954 qnames, key=lambda n: n.split('}', 1))} 1955 1956 # Write namespace declarations in prefix order ... 1957 if new_namespaces: 1958 attr_list = [ 1959 ('xmlns:' + prefix if prefix else 'xmlns', uri) 1960 for uri, prefix in new_namespaces 1961 ] 1962 attr_list.sort() 1963 else: 1964 # almost always empty 1965 attr_list = [] 1966 1967 # ... followed by attributes in URI+name order 1968 if attrs: 1969 for k, v in sorted(attrs.items()): 1970 if qattrs is not None and k in qattrs and v in resolved_names: 1971 v = parsed_qnames[resolved_names[v]][0] 1972 attr_qname, attr_name, uri = parsed_qnames[k] 1973 # No prefix for attributes in default ('') namespace. 1974 attr_list.append((attr_qname if uri else attr_name, v)) 1975 1976 # Honour xml:space attributes. 1977 space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space') 1978 self._preserve_space.append( 1979 space_behaviour == 'preserve' if space_behaviour 1980 else self._preserve_space[-1]) 1981 1982 # Write the tag. 1983 write = self._write 1984 write('<' + parsed_qnames[tag][0]) 1985 if attr_list: 1986 write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list])) 1987 write('>') 1988 1989 # Write the resolved qname text content. 1990 if qname_text is not None: 1991 write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0])) 1992 1993 self._root_seen = True 1994 self._ns_stack.append([]) 1995 1996 def end(self, tag): 1997 if self._ignored_depth: 1998 self._ignored_depth -= 1 1999 return 2000 if self._data: 2001 self._flush() 2002 self._write(f'</{self._qname(tag)[0]}>') 2003 self._preserve_space.pop() 2004 self._root_done = len(self._preserve_space) == 1 2005 self._declared_ns_stack.pop() 2006 self._ns_stack.pop() 2007 2008 def comment(self, text): 2009 if not self._with_comments: 2010 return 2011 if self._ignored_depth: 2012 return 2013 if self._root_done: 2014 self._write('\n') 2015 elif self._root_seen and self._data: 2016 self._flush() 2017 self._write(f'<!--{_escape_cdata_c14n(text)}-->') 2018 if not self._root_seen: 2019 self._write('\n') 2020 2021 def pi(self, target, data): 2022 if self._ignored_depth: 2023 return 2024 if self._root_done: 2025 self._write('\n') 2026 elif self._root_seen and self._data: 2027 self._flush() 2028 self._write( 2029 f'<?{target} {_escape_cdata_c14n(data)}?>' if data else f'<?{target}?>') 2030 if not self._root_seen: 2031 self._write('\n') 2032 2033 2034def _escape_cdata_c14n(text): 2035 # escape character data 2036 try: 2037 # it's worth avoiding do-nothing calls for strings that are 2038 # shorter than 500 character, or so. assume that's, by far, 2039 # the most common case in most applications. 2040 if '&' in text: 2041 text = text.replace('&', '&') 2042 if '<' in text: 2043 text = text.replace('<', '<') 2044 if '>' in text: 2045 text = text.replace('>', '>') 2046 if '\r' in text: 2047 text = text.replace('\r', '
') 2048 return text 2049 except (TypeError, AttributeError): 2050 _raise_serialization_error(text) 2051 2052 2053def _escape_attrib_c14n(text): 2054 # escape attribute value 2055 try: 2056 if '&' in text: 2057 text = text.replace('&', '&') 2058 if '<' in text: 2059 text = text.replace('<', '<') 2060 if '"' in text: 2061 text = text.replace('"', '"') 2062 if '\t' in text: 2063 text = text.replace('\t', '	') 2064 if '\n' in text: 2065 text = text.replace('\n', '
') 2066 if '\r' in text: 2067 text = text.replace('\r', '
') 2068 return text 2069 except (TypeError, AttributeError): 2070 _raise_serialization_error(text) 2071 2072 2073# -------------------------------------------------------------------- 2074 2075# Import the C accelerators 2076try: 2077 # Element is going to be shadowed by the C implementation. We need to keep 2078 # the Python version of it accessible for some "creative" by external code 2079 # (see tests) 2080 _Element_Py = Element 2081 2082 # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories 2083 from _elementtree import * 2084 from _elementtree import _set_factories 2085except ImportError: 2086 pass 2087else: 2088 _set_factories(Comment, ProcessingInstruction) 2089