1"""Lightweight XML support for Python. 2 3 XML is an inherently hierarchical data format, and the most natural way to 4 represent it is with a tree. This module has two classes for this purpose: 5 6 1. ElementTree represents the whole XML document as a tree and 7 8 2. Element represents a single node in this tree. 9 10 Interactions with the whole document (reading and writing to/from files) are 11 usually done on the ElementTree level. Interactions with a single XML element 12 and its sub-elements are done on the Element level. 13 14 Element is a flexible container object designed to store hierarchical data 15 structures in memory. It can be described as a cross between a list and a 16 dictionary. Each Element has a number of properties associated with it: 17 18 'tag' - a string containing the element's name. 19 20 'attributes' - a Python dictionary storing the element's attributes. 21 22 'text' - a string containing the element's text content. 23 24 'tail' - an optional string containing text after the element's end tag. 25 26 And a number of child elements stored in a Python sequence. 27 28 To create an element instance, use the Element constructor, 29 or the SubElement factory function. 30 31 You can also use the ElementTree class to wrap an element structure 32 and convert it to and from XML. 33 34""" 35 36#--------------------------------------------------------------------- 37# Licensed to PSF under a Contributor Agreement. 38# See https://www.python.org/psf/license for licensing details. 39# 40# ElementTree 41# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved. 42# 43# fredrik@pythonware.com 44# http://www.pythonware.com 45# -------------------------------------------------------------------- 46# The ElementTree toolkit is 47# 48# Copyright (c) 1999-2008 by Fredrik Lundh 49# 50# By obtaining, using, and/or copying this software and/or its 51# associated documentation, you agree that you have read, understood, 52# and will comply with the following terms and conditions: 53# 54# Permission to use, copy, modify, and distribute this software and 55# its associated documentation for any purpose and without fee is 56# hereby granted, provided that the above copyright notice appears in 57# all copies, and that both that copyright notice and this permission 58# notice appear in supporting documentation, and that the name of 59# Secret Labs AB or the author not be used in advertising or publicity 60# pertaining to distribution of the software without specific, written 61# prior permission. 62# 63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD 64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- 65# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR 66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY 67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 70# OF THIS SOFTWARE. 71# -------------------------------------------------------------------- 72 73__all__ = [ 74 # public symbols 75 "Comment", 76 "dump", 77 "Element", "ElementTree", 78 "fromstring", "fromstringlist", 79 "indent", "iselement", "iterparse", 80 "parse", "ParseError", 81 "PI", "ProcessingInstruction", 82 "QName", 83 "SubElement", 84 "tostring", "tostringlist", 85 "TreeBuilder", 86 "VERSION", 87 "XML", "XMLID", 88 "XMLParser", "XMLPullParser", 89 "register_namespace", 90 "canonicalize", "C14NWriterTarget", 91 ] 92 93VERSION = "1.3.0" 94 95import sys 96import re 97import warnings 98import io 99import collections 100import collections.abc 101import contextlib 102import weakref 103 104from . import ElementPath 105 106 107class ParseError(SyntaxError): 108 """An error when parsing an XML document. 109 110 In addition to its exception value, a ParseError contains 111 two extra attributes: 112 'code' - the specific exception code 113 'position' - the line and column of the error 114 115 """ 116 pass 117 118# -------------------------------------------------------------------- 119 120 121def iselement(element): 122 """Return True if *element* appears to be an Element.""" 123 return hasattr(element, 'tag') 124 125 126class Element: 127 """An XML element. 128 129 This class is the reference implementation of the Element interface. 130 131 An element's length is its number of subelements. That means if you 132 want to check if an element is truly empty, you should check BOTH 133 its length AND its text attribute. 134 135 The element tag, attribute names, and attribute values can be either 136 bytes or strings. 137 138 *tag* is the element name. *attrib* is an optional dictionary containing 139 element attributes. *extra* are additional element attributes given as 140 keyword arguments. 141 142 Example form: 143 <tag attrib>text<child/>...</tag>tail 144 145 """ 146 147 tag = None 148 """The element's name.""" 149 150 attrib = None 151 """Dictionary of the element's attributes.""" 152 153 text = None 154 """ 155 Text before first subelement. This is either a string or the value None. 156 Note that if there is no text, this attribute may be either 157 None or the empty string, depending on the parser. 158 159 """ 160 161 tail = None 162 """ 163 Text after this element's end tag, but before the next sibling element's 164 start tag. This is either a string or the value None. Note that if there 165 was no text, this attribute may be either None or an empty string, 166 depending on the parser. 167 168 """ 169 170 def __init__(self, tag, attrib={}, **extra): 171 if not isinstance(attrib, dict): 172 raise TypeError("attrib must be dict, not %s" % ( 173 attrib.__class__.__name__,)) 174 self.tag = tag 175 self.attrib = {**attrib, **extra} 176 self._children = [] 177 178 def __repr__(self): 179 return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self)) 180 181 def makeelement(self, tag, attrib): 182 """Create a new element with the same type. 183 184 *tag* is a string containing the element name. 185 *attrib* is a dictionary containing the element attributes. 186 187 Do not call this method, use the SubElement factory function instead. 188 189 """ 190 return self.__class__(tag, attrib) 191 192 def __copy__(self): 193 elem = self.makeelement(self.tag, self.attrib) 194 elem.text = self.text 195 elem.tail = self.tail 196 elem[:] = self 197 return elem 198 199 def __len__(self): 200 return len(self._children) 201 202 def __bool__(self): 203 warnings.warn( 204 "Testing an element's truth value will always return True in " 205 "future versions. " 206 "Use specific 'len(elem)' or 'elem is not None' test instead.", 207 DeprecationWarning, stacklevel=2 208 ) 209 return len(self._children) != 0 # emulate old behaviour, for now 210 211 def __getitem__(self, index): 212 return self._children[index] 213 214 def __setitem__(self, index, element): 215 if isinstance(index, slice): 216 for elt in element: 217 self._assert_is_element(elt) 218 else: 219 self._assert_is_element(element) 220 self._children[index] = element 221 222 def __delitem__(self, index): 223 del self._children[index] 224 225 def append(self, subelement): 226 """Add *subelement* to the end of this element. 227 228 The new element will appear in document order after the last existing 229 subelement (or directly after the text, if it's the first subelement), 230 but before the end tag for this element. 231 232 """ 233 self._assert_is_element(subelement) 234 self._children.append(subelement) 235 236 def extend(self, elements): 237 """Append subelements from a sequence. 238 239 *elements* is a sequence with zero or more elements. 240 241 """ 242 for element in elements: 243 self._assert_is_element(element) 244 self._children.append(element) 245 246 def insert(self, index, subelement): 247 """Insert *subelement* at position *index*.""" 248 self._assert_is_element(subelement) 249 self._children.insert(index, subelement) 250 251 def _assert_is_element(self, e): 252 # Need to refer to the actual Python implementation, not the 253 # shadowing C implementation. 254 if not isinstance(e, _Element_Py): 255 raise TypeError('expected an Element, not %s' % type(e).__name__) 256 257 def remove(self, subelement): 258 """Remove matching subelement. 259 260 Unlike the find methods, this method compares elements based on 261 identity, NOT ON tag value or contents. To remove subelements by 262 other means, the easiest way is to use a list comprehension to 263 select what elements to keep, and then use slice assignment to update 264 the parent element. 265 266 ValueError is raised if a matching element could not be found. 267 268 """ 269 # assert iselement(element) 270 self._children.remove(subelement) 271 272 def find(self, path, namespaces=None): 273 """Find first matching element by tag name or path. 274 275 *path* is a string having either an element tag or an XPath, 276 *namespaces* is an optional mapping from namespace prefix to full name. 277 278 Return the first matching element, or None if no element was found. 279 280 """ 281 return ElementPath.find(self, path, namespaces) 282 283 def findtext(self, path, default=None, namespaces=None): 284 """Find text for first matching element by tag name or path. 285 286 *path* is a string having either an element tag or an XPath, 287 *default* is the value to return if the element was not found, 288 *namespaces* is an optional mapping from namespace prefix to full name. 289 290 Return text content of first matching element, or default value if 291 none was found. Note that if an element is found having no text 292 content, the empty string is returned. 293 294 """ 295 return ElementPath.findtext(self, path, default, namespaces) 296 297 def findall(self, path, namespaces=None): 298 """Find all matching subelements by tag name or path. 299 300 *path* is a string having either an element tag or an XPath, 301 *namespaces* is an optional mapping from namespace prefix to full name. 302 303 Returns list containing all matching elements in document order. 304 305 """ 306 return ElementPath.findall(self, path, namespaces) 307 308 def iterfind(self, path, namespaces=None): 309 """Find all matching subelements by tag name or path. 310 311 *path* is a string having either an element tag or an XPath, 312 *namespaces* is an optional mapping from namespace prefix to full name. 313 314 Return an iterable yielding all matching elements in document order. 315 316 """ 317 return ElementPath.iterfind(self, path, namespaces) 318 319 def clear(self): 320 """Reset element. 321 322 This function removes all subelements, clears all attributes, and sets 323 the text and tail attributes to None. 324 325 """ 326 self.attrib.clear() 327 self._children = [] 328 self.text = self.tail = None 329 330 def get(self, key, default=None): 331 """Get element attribute. 332 333 Equivalent to attrib.get, but some implementations may handle this a 334 bit more efficiently. *key* is what attribute to look for, and 335 *default* is what to return if the attribute was not found. 336 337 Returns a string containing the attribute value, or the default if 338 attribute was not found. 339 340 """ 341 return self.attrib.get(key, default) 342 343 def set(self, key, value): 344 """Set element attribute. 345 346 Equivalent to attrib[key] = value, but some implementations may handle 347 this a bit more efficiently. *key* is what attribute to set, and 348 *value* is the attribute value to set it to. 349 350 """ 351 self.attrib[key] = value 352 353 def keys(self): 354 """Get list of attribute names. 355 356 Names are returned in an arbitrary order, just like an ordinary 357 Python dict. Equivalent to attrib.keys() 358 359 """ 360 return self.attrib.keys() 361 362 def items(self): 363 """Get element attributes as a sequence. 364 365 The attributes are returned in arbitrary order. Equivalent to 366 attrib.items(). 367 368 Return a list of (name, value) tuples. 369 370 """ 371 return self.attrib.items() 372 373 def iter(self, tag=None): 374 """Create tree iterator. 375 376 The iterator loops over the element and all subelements in document 377 order, returning all elements with a matching tag. 378 379 If the tree structure is modified during iteration, new or removed 380 elements may or may not be included. To get a stable set, use the 381 list() function on the iterator, and loop over the resulting list. 382 383 *tag* is what tags to look for (default is to return all elements) 384 385 Return an iterator containing all the matching elements. 386 387 """ 388 if tag == "*": 389 tag = None 390 if tag is None or self.tag == tag: 391 yield self 392 for e in self._children: 393 yield from e.iter(tag) 394 395 def itertext(self): 396 """Create text iterator. 397 398 The iterator loops over the element and all subelements in document 399 order, returning all inner text. 400 401 """ 402 tag = self.tag 403 if not isinstance(tag, str) and tag is not None: 404 return 405 t = self.text 406 if t: 407 yield t 408 for e in self: 409 yield from e.itertext() 410 t = e.tail 411 if t: 412 yield t 413 414 415def SubElement(parent, tag, attrib={}, **extra): 416 """Subelement factory which creates an element instance, and appends it 417 to an existing parent. 418 419 The element tag, attribute names, and attribute values can be either 420 bytes or Unicode strings. 421 422 *parent* is the parent element, *tag* is the subelements name, *attrib* is 423 an optional directory containing element attributes, *extra* are 424 additional attributes given as keyword arguments. 425 426 """ 427 attrib = {**attrib, **extra} 428 element = parent.makeelement(tag, attrib) 429 parent.append(element) 430 return element 431 432 433def Comment(text=None): 434 """Comment element factory. 435 436 This function creates a special element which the standard serializer 437 serializes as an XML comment. 438 439 *text* is a string containing the comment string. 440 441 """ 442 element = Element(Comment) 443 element.text = text 444 return element 445 446 447def ProcessingInstruction(target, text=None): 448 """Processing Instruction element factory. 449 450 This function creates a special element which the standard serializer 451 serializes as an XML comment. 452 453 *target* is a string containing the processing instruction, *text* is a 454 string containing the processing instruction contents, if any. 455 456 """ 457 element = Element(ProcessingInstruction) 458 element.text = target 459 if text: 460 element.text = element.text + " " + text 461 return element 462 463PI = ProcessingInstruction 464 465 466class QName: 467 """Qualified name wrapper. 468 469 This class can be used to wrap a QName attribute value in order to get 470 proper namespace handing on output. 471 472 *text_or_uri* is a string containing the QName value either in the form 473 {uri}local, or if the tag argument is given, the URI part of a QName. 474 475 *tag* is an optional argument which if given, will make the first 476 argument (text_or_uri) be interpreted as a URI, and this argument (tag) 477 be interpreted as a local name. 478 479 """ 480 def __init__(self, text_or_uri, tag=None): 481 if tag: 482 text_or_uri = "{%s}%s" % (text_or_uri, tag) 483 self.text = text_or_uri 484 def __str__(self): 485 return self.text 486 def __repr__(self): 487 return '<%s %r>' % (self.__class__.__name__, self.text) 488 def __hash__(self): 489 return hash(self.text) 490 def __le__(self, other): 491 if isinstance(other, QName): 492 return self.text <= other.text 493 return self.text <= other 494 def __lt__(self, other): 495 if isinstance(other, QName): 496 return self.text < other.text 497 return self.text < other 498 def __ge__(self, other): 499 if isinstance(other, QName): 500 return self.text >= other.text 501 return self.text >= other 502 def __gt__(self, other): 503 if isinstance(other, QName): 504 return self.text > other.text 505 return self.text > other 506 def __eq__(self, other): 507 if isinstance(other, QName): 508 return self.text == other.text 509 return self.text == other 510 511# -------------------------------------------------------------------- 512 513 514class ElementTree: 515 """An XML element hierarchy. 516 517 This class also provides support for serialization to and from 518 standard XML. 519 520 *element* is an optional root element node, 521 *file* is an optional file handle or file name of an XML file whose 522 contents will be used to initialize the tree with. 523 524 """ 525 def __init__(self, element=None, file=None): 526 # assert element is None or iselement(element) 527 self._root = element # first node 528 if file: 529 self.parse(file) 530 531 def getroot(self): 532 """Return root element of this tree.""" 533 return self._root 534 535 def _setroot(self, element): 536 """Replace root element of this tree. 537 538 This will discard the current contents of the tree and replace it 539 with the given element. Use with care! 540 541 """ 542 # assert iselement(element) 543 self._root = element 544 545 def parse(self, source, parser=None): 546 """Load external XML document into element tree. 547 548 *source* is a file name or file object, *parser* is an optional parser 549 instance that defaults to XMLParser. 550 551 ParseError is raised if the parser fails to parse the document. 552 553 Returns the root element of the given source document. 554 555 """ 556 close_source = False 557 if not hasattr(source, "read"): 558 source = open(source, "rb") 559 close_source = True 560 try: 561 if parser is None: 562 # If no parser was specified, create a default XMLParser 563 parser = XMLParser() 564 if hasattr(parser, '_parse_whole'): 565 # The default XMLParser, when it comes from an accelerator, 566 # can define an internal _parse_whole API for efficiency. 567 # It can be used to parse the whole source without feeding 568 # it with chunks. 569 self._root = parser._parse_whole(source) 570 return self._root 571 while data := source.read(65536): 572 parser.feed(data) 573 self._root = parser.close() 574 return self._root 575 finally: 576 if close_source: 577 source.close() 578 579 def iter(self, tag=None): 580 """Create and return tree iterator for the root element. 581 582 The iterator loops over all elements in this tree, in document order. 583 584 *tag* is a string with the tag name to iterate over 585 (default is to return all elements). 586 587 """ 588 # assert self._root is not None 589 return self._root.iter(tag) 590 591 def find(self, path, namespaces=None): 592 """Find first matching element by tag name or path. 593 594 Same as getroot().find(path), which is Element.find() 595 596 *path* is a string having either an element tag or an XPath, 597 *namespaces* is an optional mapping from namespace prefix to full name. 598 599 Return the first matching element, or None if no element was found. 600 601 """ 602 # assert self._root is not None 603 if path[:1] == "/": 604 path = "." + path 605 warnings.warn( 606 "This search is broken in 1.3 and earlier, and will be " 607 "fixed in a future version. If you rely on the current " 608 "behaviour, change it to %r" % path, 609 FutureWarning, stacklevel=2 610 ) 611 return self._root.find(path, namespaces) 612 613 def findtext(self, path, default=None, namespaces=None): 614 """Find first matching element by tag name or path. 615 616 Same as getroot().findtext(path), which is Element.findtext() 617 618 *path* is a string having either an element tag or an XPath, 619 *namespaces* is an optional mapping from namespace prefix to full name. 620 621 Return the first matching element, or None if no element was found. 622 623 """ 624 # assert self._root is not None 625 if path[:1] == "/": 626 path = "." + path 627 warnings.warn( 628 "This search is broken in 1.3 and earlier, and will be " 629 "fixed in a future version. If you rely on the current " 630 "behaviour, change it to %r" % path, 631 FutureWarning, stacklevel=2 632 ) 633 return self._root.findtext(path, default, namespaces) 634 635 def findall(self, path, namespaces=None): 636 """Find all matching subelements by tag name or path. 637 638 Same as getroot().findall(path), which is Element.findall(). 639 640 *path* is a string having either an element tag or an XPath, 641 *namespaces* is an optional mapping from namespace prefix to full name. 642 643 Return list containing all matching elements in document order. 644 645 """ 646 # assert self._root is not None 647 if path[:1] == "/": 648 path = "." + path 649 warnings.warn( 650 "This search is broken in 1.3 and earlier, and will be " 651 "fixed in a future version. If you rely on the current " 652 "behaviour, change it to %r" % path, 653 FutureWarning, stacklevel=2 654 ) 655 return self._root.findall(path, namespaces) 656 657 def iterfind(self, path, namespaces=None): 658 """Find all matching subelements by tag name or path. 659 660 Same as getroot().iterfind(path), which is element.iterfind() 661 662 *path* is a string having either an element tag or an XPath, 663 *namespaces* is an optional mapping from namespace prefix to full name. 664 665 Return an iterable yielding all matching elements in document order. 666 667 """ 668 # assert self._root is not None 669 if path[:1] == "/": 670 path = "." + path 671 warnings.warn( 672 "This search is broken in 1.3 and earlier, and will be " 673 "fixed in a future version. If you rely on the current " 674 "behaviour, change it to %r" % path, 675 FutureWarning, stacklevel=2 676 ) 677 return self._root.iterfind(path, namespaces) 678 679 def write(self, file_or_filename, 680 encoding=None, 681 xml_declaration=None, 682 default_namespace=None, 683 method=None, *, 684 short_empty_elements=True): 685 """Write element tree to a file as XML. 686 687 Arguments: 688 *file_or_filename* -- file name or a file object opened for writing 689 690 *encoding* -- the output encoding (default: US-ASCII) 691 692 *xml_declaration* -- bool indicating if an XML declaration should be 693 added to the output. If None, an XML declaration 694 is added if encoding IS NOT either of: 695 US-ASCII, UTF-8, or Unicode 696 697 *default_namespace* -- sets the default XML namespace (for "xmlns") 698 699 *method* -- either "xml" (default), "html, "text", or "c14n" 700 701 *short_empty_elements* -- controls the formatting of elements 702 that contain no content. If True (default) 703 they are emitted as a single self-closed 704 tag, otherwise they are emitted as a pair 705 of start/end tags 706 707 """ 708 if not method: 709 method = "xml" 710 elif method not in _serialize: 711 raise ValueError("unknown method %r" % method) 712 if not encoding: 713 if method == "c14n": 714 encoding = "utf-8" 715 else: 716 encoding = "us-ascii" 717 with _get_writer(file_or_filename, encoding) as (write, declared_encoding): 718 if method == "xml" and (xml_declaration or 719 (xml_declaration is None and 720 encoding.lower() != "unicode" and 721 declared_encoding.lower() not in ("utf-8", "us-ascii"))): 722 write("<?xml version='1.0' encoding='%s'?>\n" % ( 723 declared_encoding,)) 724 if method == "text": 725 _serialize_text(write, self._root) 726 else: 727 qnames, namespaces = _namespaces(self._root, default_namespace) 728 serialize = _serialize[method] 729 serialize(write, self._root, qnames, namespaces, 730 short_empty_elements=short_empty_elements) 731 732 def write_c14n(self, file): 733 # lxml.etree compatibility. use output method instead 734 return self.write(file, method="c14n") 735 736# -------------------------------------------------------------------- 737# serialization support 738 739@contextlib.contextmanager 740def _get_writer(file_or_filename, encoding): 741 # returns text write method and release all resources after using 742 try: 743 write = file_or_filename.write 744 except AttributeError: 745 # file_or_filename is a file name 746 if encoding.lower() == "unicode": 747 encoding="utf-8" 748 with open(file_or_filename, "w", encoding=encoding, 749 errors="xmlcharrefreplace") as file: 750 yield file.write, encoding 751 else: 752 # file_or_filename is a file-like object 753 # encoding determines if it is a text or binary writer 754 if encoding.lower() == "unicode": 755 # use a text writer as is 756 yield write, getattr(file_or_filename, "encoding", None) or "utf-8" 757 else: 758 # wrap a binary writer with TextIOWrapper 759 with contextlib.ExitStack() as stack: 760 if isinstance(file_or_filename, io.BufferedIOBase): 761 file = file_or_filename 762 elif isinstance(file_or_filename, io.RawIOBase): 763 file = io.BufferedWriter(file_or_filename) 764 # Keep the original file open when the BufferedWriter is 765 # destroyed 766 stack.callback(file.detach) 767 else: 768 # This is to handle passed objects that aren't in the 769 # IOBase hierarchy, but just have a write method 770 file = io.BufferedIOBase() 771 file.writable = lambda: True 772 file.write = write 773 try: 774 # TextIOWrapper uses this methods to determine 775 # if BOM (for UTF-16, etc) should be added 776 file.seekable = file_or_filename.seekable 777 file.tell = file_or_filename.tell 778 except AttributeError: 779 pass 780 file = io.TextIOWrapper(file, 781 encoding=encoding, 782 errors="xmlcharrefreplace", 783 newline="\n") 784 # Keep the original file open when the TextIOWrapper is 785 # destroyed 786 stack.callback(file.detach) 787 yield file.write, encoding 788 789def _namespaces(elem, default_namespace=None): 790 # identify namespaces used in this tree 791 792 # maps qnames to *encoded* prefix:local names 793 qnames = {None: None} 794 795 # maps uri:s to prefixes 796 namespaces = {} 797 if default_namespace: 798 namespaces[default_namespace] = "" 799 800 def add_qname(qname): 801 # calculate serialized qname representation 802 try: 803 if qname[:1] == "{": 804 uri, tag = qname[1:].rsplit("}", 1) 805 prefix = namespaces.get(uri) 806 if prefix is None: 807 prefix = _namespace_map.get(uri) 808 if prefix is None: 809 prefix = "ns%d" % len(namespaces) 810 if prefix != "xml": 811 namespaces[uri] = prefix 812 if prefix: 813 qnames[qname] = "%s:%s" % (prefix, tag) 814 else: 815 qnames[qname] = tag # default element 816 else: 817 if default_namespace: 818 # FIXME: can this be handled in XML 1.0? 819 raise ValueError( 820 "cannot use non-qualified names with " 821 "default_namespace option" 822 ) 823 qnames[qname] = qname 824 except TypeError: 825 _raise_serialization_error(qname) 826 827 # populate qname and namespaces table 828 for elem in elem.iter(): 829 tag = elem.tag 830 if isinstance(tag, QName): 831 if tag.text not in qnames: 832 add_qname(tag.text) 833 elif isinstance(tag, str): 834 if tag not in qnames: 835 add_qname(tag) 836 elif tag is not None and tag is not Comment and tag is not PI: 837 _raise_serialization_error(tag) 838 for key, value in elem.items(): 839 if isinstance(key, QName): 840 key = key.text 841 if key not in qnames: 842 add_qname(key) 843 if isinstance(value, QName) and value.text not in qnames: 844 add_qname(value.text) 845 text = elem.text 846 if isinstance(text, QName) and text.text not in qnames: 847 add_qname(text.text) 848 return qnames, namespaces 849 850def _serialize_xml(write, elem, qnames, namespaces, 851 short_empty_elements, **kwargs): 852 tag = elem.tag 853 text = elem.text 854 if tag is Comment: 855 write("<!--%s-->" % text) 856 elif tag is ProcessingInstruction: 857 write("<?%s?>" % text) 858 else: 859 tag = qnames[tag] 860 if tag is None: 861 if text: 862 write(_escape_cdata(text)) 863 for e in elem: 864 _serialize_xml(write, e, qnames, None, 865 short_empty_elements=short_empty_elements) 866 else: 867 write("<" + tag) 868 items = list(elem.items()) 869 if items or namespaces: 870 if namespaces: 871 for v, k in sorted(namespaces.items(), 872 key=lambda x: x[1]): # sort on prefix 873 if k: 874 k = ":" + k 875 write(" xmlns%s=\"%s\"" % ( 876 k, 877 _escape_attrib(v) 878 )) 879 for k, v in items: 880 if isinstance(k, QName): 881 k = k.text 882 if isinstance(v, QName): 883 v = qnames[v.text] 884 else: 885 v = _escape_attrib(v) 886 write(" %s=\"%s\"" % (qnames[k], v)) 887 if text or len(elem) or not short_empty_elements: 888 write(">") 889 if text: 890 write(_escape_cdata(text)) 891 for e in elem: 892 _serialize_xml(write, e, qnames, None, 893 short_empty_elements=short_empty_elements) 894 write("</" + tag + ">") 895 else: 896 write(" />") 897 if elem.tail: 898 write(_escape_cdata(elem.tail)) 899 900HTML_EMPTY = {"area", "base", "basefont", "br", "col", "embed", "frame", "hr", 901 "img", "input", "isindex", "link", "meta", "param", "source", 902 "track", "wbr"} 903 904def _serialize_html(write, elem, qnames, namespaces, **kwargs): 905 tag = elem.tag 906 text = elem.text 907 if tag is Comment: 908 write("<!--%s-->" % _escape_cdata(text)) 909 elif tag is ProcessingInstruction: 910 write("<?%s?>" % _escape_cdata(text)) 911 else: 912 tag = qnames[tag] 913 if tag is None: 914 if text: 915 write(_escape_cdata(text)) 916 for e in elem: 917 _serialize_html(write, e, qnames, None) 918 else: 919 write("<" + tag) 920 items = list(elem.items()) 921 if items or namespaces: 922 if namespaces: 923 for v, k in sorted(namespaces.items(), 924 key=lambda x: x[1]): # sort on prefix 925 if k: 926 k = ":" + k 927 write(" xmlns%s=\"%s\"" % ( 928 k, 929 _escape_attrib(v) 930 )) 931 for k, v in items: 932 if isinstance(k, QName): 933 k = k.text 934 if isinstance(v, QName): 935 v = qnames[v.text] 936 else: 937 v = _escape_attrib_html(v) 938 # FIXME: handle boolean attributes 939 write(" %s=\"%s\"" % (qnames[k], v)) 940 write(">") 941 ltag = tag.lower() 942 if text: 943 if ltag == "script" or ltag == "style": 944 write(text) 945 else: 946 write(_escape_cdata(text)) 947 for e in elem: 948 _serialize_html(write, e, qnames, None) 949 if ltag not in HTML_EMPTY: 950 write("</" + tag + ">") 951 if elem.tail: 952 write(_escape_cdata(elem.tail)) 953 954def _serialize_text(write, elem): 955 for part in elem.itertext(): 956 write(part) 957 if elem.tail: 958 write(elem.tail) 959 960_serialize = { 961 "xml": _serialize_xml, 962 "html": _serialize_html, 963 "text": _serialize_text, 964# this optional method is imported at the end of the module 965# "c14n": _serialize_c14n, 966} 967 968 969def register_namespace(prefix, uri): 970 """Register a namespace prefix. 971 972 The registry is global, and any existing mapping for either the 973 given prefix or the namespace URI will be removed. 974 975 *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and 976 attributes in this namespace will be serialized with prefix if possible. 977 978 ValueError is raised if prefix is reserved or is invalid. 979 980 """ 981 if re.match(r"ns\d+$", prefix): 982 raise ValueError("Prefix format reserved for internal use") 983 for k, v in list(_namespace_map.items()): 984 if k == uri or v == prefix: 985 del _namespace_map[k] 986 _namespace_map[uri] = prefix 987 988_namespace_map = { 989 # "well-known" namespace prefixes 990 "http://www.w3.org/XML/1998/namespace": "xml", 991 "http://www.w3.org/1999/xhtml": "html", 992 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", 993 "http://schemas.xmlsoap.org/wsdl/": "wsdl", 994 # xml schema 995 "http://www.w3.org/2001/XMLSchema": "xs", 996 "http://www.w3.org/2001/XMLSchema-instance": "xsi", 997 # dublin core 998 "http://purl.org/dc/elements/1.1/": "dc", 999} 1000# For tests and troubleshooting 1001register_namespace._namespace_map = _namespace_map 1002 1003def _raise_serialization_error(text): 1004 raise TypeError( 1005 "cannot serialize %r (type %s)" % (text, type(text).__name__) 1006 ) 1007 1008def _escape_cdata(text): 1009 # escape character data 1010 try: 1011 # it's worth avoiding do-nothing calls for strings that are 1012 # shorter than 500 characters, or so. assume that's, by far, 1013 # the most common case in most applications. 1014 if "&" in text: 1015 text = text.replace("&", "&") 1016 if "<" in text: 1017 text = text.replace("<", "<") 1018 if ">" in text: 1019 text = text.replace(">", ">") 1020 return text 1021 except (TypeError, AttributeError): 1022 _raise_serialization_error(text) 1023 1024def _escape_attrib(text): 1025 # escape attribute value 1026 try: 1027 if "&" in text: 1028 text = text.replace("&", "&") 1029 if "<" in text: 1030 text = text.replace("<", "<") 1031 if ">" in text: 1032 text = text.replace(">", ">") 1033 if "\"" in text: 1034 text = text.replace("\"", """) 1035 # Although section 2.11 of the XML specification states that CR or 1036 # CR LN should be replaced with just LN, it applies only to EOLNs 1037 # which take part of organizing file into lines. Within attributes, 1038 # we are replacing these with entity numbers, so they do not count. 1039 # http://www.w3.org/TR/REC-xml/#sec-line-ends 1040 # The current solution, contained in following six lines, was 1041 # discussed in issue 17582 and 39011. 1042 if "\r" in text: 1043 text = text.replace("\r", " ") 1044 if "\n" in text: 1045 text = text.replace("\n", " ") 1046 if "\t" in text: 1047 text = text.replace("\t", "	") 1048 return text 1049 except (TypeError, AttributeError): 1050 _raise_serialization_error(text) 1051 1052def _escape_attrib_html(text): 1053 # escape attribute value 1054 try: 1055 if "&" in text: 1056 text = text.replace("&", "&") 1057 if ">" in text: 1058 text = text.replace(">", ">") 1059 if "\"" in text: 1060 text = text.replace("\"", """) 1061 return text 1062 except (TypeError, AttributeError): 1063 _raise_serialization_error(text) 1064 1065# -------------------------------------------------------------------- 1066 1067def tostring(element, encoding=None, method=None, *, 1068 xml_declaration=None, default_namespace=None, 1069 short_empty_elements=True): 1070 """Generate string representation of XML element. 1071 1072 All subelements are included. If encoding is "unicode", a string 1073 is returned. Otherwise a bytestring is returned. 1074 1075 *element* is an Element instance, *encoding* is an optional output 1076 encoding defaulting to US-ASCII, *method* is an optional output which can 1077 be one of "xml" (default), "html", "text" or "c14n", *default_namespace* 1078 sets the default XML namespace (for "xmlns"). 1079 1080 Returns an (optionally) encoded string containing the XML data. 1081 1082 """ 1083 stream = io.StringIO() if encoding == 'unicode' else io.BytesIO() 1084 ElementTree(element).write(stream, encoding, 1085 xml_declaration=xml_declaration, 1086 default_namespace=default_namespace, 1087 method=method, 1088 short_empty_elements=short_empty_elements) 1089 return stream.getvalue() 1090 1091class _ListDataStream(io.BufferedIOBase): 1092 """An auxiliary stream accumulating into a list reference.""" 1093 def __init__(self, lst): 1094 self.lst = lst 1095 1096 def writable(self): 1097 return True 1098 1099 def seekable(self): 1100 return True 1101 1102 def write(self, b): 1103 self.lst.append(b) 1104 1105 def tell(self): 1106 return len(self.lst) 1107 1108def tostringlist(element, encoding=None, method=None, *, 1109 xml_declaration=None, default_namespace=None, 1110 short_empty_elements=True): 1111 lst = [] 1112 stream = _ListDataStream(lst) 1113 ElementTree(element).write(stream, encoding, 1114 xml_declaration=xml_declaration, 1115 default_namespace=default_namespace, 1116 method=method, 1117 short_empty_elements=short_empty_elements) 1118 return lst 1119 1120 1121def dump(elem): 1122 """Write element tree or element structure to sys.stdout. 1123 1124 This function should be used for debugging only. 1125 1126 *elem* is either an ElementTree, or a single Element. The exact output 1127 format is implementation dependent. In this version, it's written as an 1128 ordinary XML file. 1129 1130 """ 1131 # debugging 1132 if not isinstance(elem, ElementTree): 1133 elem = ElementTree(elem) 1134 elem.write(sys.stdout, encoding="unicode") 1135 tail = elem.getroot().tail 1136 if not tail or tail[-1] != "\n": 1137 sys.stdout.write("\n") 1138 1139 1140def indent(tree, space=" ", level=0): 1141 """Indent an XML document by inserting newlines and indentation space 1142 after elements. 1143 1144 *tree* is the ElementTree or Element to modify. The (root) element 1145 itself will not be changed, but the tail text of all elements in its 1146 subtree will be adapted. 1147 1148 *space* is the whitespace to insert for each indentation level, two 1149 space characters by default. 1150 1151 *level* is the initial indentation level. Setting this to a higher 1152 value than 0 can be used for indenting subtrees that are more deeply 1153 nested inside of a document. 1154 """ 1155 if isinstance(tree, ElementTree): 1156 tree = tree.getroot() 1157 if level < 0: 1158 raise ValueError(f"Initial indentation level must be >= 0, got {level}") 1159 if not len(tree): 1160 return 1161 1162 # Reduce the memory consumption by reusing indentation strings. 1163 indentations = ["\n" + level * space] 1164 1165 def _indent_children(elem, level): 1166 # Start a new indentation level for the first child. 1167 child_level = level + 1 1168 try: 1169 child_indentation = indentations[child_level] 1170 except IndexError: 1171 child_indentation = indentations[level] + space 1172 indentations.append(child_indentation) 1173 1174 if not elem.text or not elem.text.strip(): 1175 elem.text = child_indentation 1176 1177 for child in elem: 1178 if len(child): 1179 _indent_children(child, child_level) 1180 if not child.tail or not child.tail.strip(): 1181 child.tail = child_indentation 1182 1183 # Dedent after the last child by overwriting the previous indentation. 1184 if not child.tail.strip(): 1185 child.tail = indentations[level] 1186 1187 _indent_children(tree, 0) 1188 1189 1190# -------------------------------------------------------------------- 1191# parsing 1192 1193 1194def parse(source, parser=None): 1195 """Parse XML document into element tree. 1196 1197 *source* is a filename or file object containing XML data, 1198 *parser* is an optional parser instance defaulting to XMLParser. 1199 1200 Return an ElementTree instance. 1201 1202 """ 1203 tree = ElementTree() 1204 tree.parse(source, parser) 1205 return tree 1206 1207 1208def iterparse(source, events=None, parser=None): 1209 """Incrementally parse XML document into ElementTree. 1210 1211 This class also reports what's going on to the user based on the 1212 *events* it is initialized with. The supported events are the strings 1213 "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get 1214 detailed namespace information). If *events* is omitted, only 1215 "end" events are reported. 1216 1217 *source* is a filename or file object containing XML data, *events* is 1218 a list of events to report back, *parser* is an optional parser instance. 1219 1220 Returns an iterator providing (event, elem) pairs. 1221 1222 """ 1223 # Use the internal, undocumented _parser argument for now; When the 1224 # parser argument of iterparse is removed, this can be killed. 1225 pullparser = XMLPullParser(events=events, _parser=parser) 1226 1227 if not hasattr(source, "read"): 1228 source = open(source, "rb") 1229 close_source = True 1230 else: 1231 close_source = False 1232 1233 def iterator(source): 1234 try: 1235 while True: 1236 yield from pullparser.read_events() 1237 # load event buffer 1238 data = source.read(16 * 1024) 1239 if not data: 1240 break 1241 pullparser.feed(data) 1242 root = pullparser._close_and_return_root() 1243 yield from pullparser.read_events() 1244 it = wr() 1245 if it is not None: 1246 it.root = root 1247 finally: 1248 if close_source: 1249 source.close() 1250 1251 gen = iterator(source) 1252 class IterParseIterator(collections.abc.Iterator): 1253 __next__ = gen.__next__ 1254 def close(self): 1255 if close_source: 1256 source.close() 1257 gen.close() 1258 1259 def __del__(self): 1260 # TODO: Emit a ResourceWarning if it was not explicitly closed. 1261 # (When the close() method will be supported in all maintained Python versions.) 1262 if close_source: 1263 source.close() 1264 1265 it = IterParseIterator() 1266 it.root = None 1267 wr = weakref.ref(it) 1268 return it 1269 1270 1271class XMLPullParser: 1272 1273 def __init__(self, events=None, *, _parser=None): 1274 # The _parser argument is for internal use only and must not be relied 1275 # upon in user code. It will be removed in a future release. 1276 # See https://bugs.python.org/issue17741 for more details. 1277 1278 self._events_queue = collections.deque() 1279 self._parser = _parser or XMLParser(target=TreeBuilder()) 1280 # wire up the parser for event reporting 1281 if events is None: 1282 events = ("end",) 1283 self._parser._setevents(self._events_queue, events) 1284 1285 def feed(self, data): 1286 """Feed encoded data to parser.""" 1287 if self._parser is None: 1288 raise ValueError("feed() called after end of stream") 1289 if data: 1290 try: 1291 self._parser.feed(data) 1292 except SyntaxError as exc: 1293 self._events_queue.append(exc) 1294 1295 def _close_and_return_root(self): 1296 # iterparse needs this to set its root attribute properly :( 1297 root = self._parser.close() 1298 self._parser = None 1299 return root 1300 1301 def close(self): 1302 """Finish feeding data to parser. 1303 1304 Unlike XMLParser, does not return the root element. Use 1305 read_events() to consume elements from XMLPullParser. 1306 """ 1307 self._close_and_return_root() 1308 1309 def read_events(self): 1310 """Return an iterator over currently available (event, elem) pairs. 1311 1312 Events are consumed from the internal event queue as they are 1313 retrieved from the iterator. 1314 """ 1315 events = self._events_queue 1316 while events: 1317 event = events.popleft() 1318 if isinstance(event, Exception): 1319 raise event 1320 else: 1321 yield event 1322 1323 def flush(self): 1324 if self._parser is None: 1325 raise ValueError("flush() called after end of stream") 1326 self._parser.flush() 1327 1328 1329def XML(text, parser=None): 1330 """Parse XML document from string constant. 1331 1332 This function can be used to embed "XML Literals" in Python code. 1333 1334 *text* is a string containing XML data, *parser* is an 1335 optional parser instance, defaulting to the standard XMLParser. 1336 1337 Returns an Element instance. 1338 1339 """ 1340 if not parser: 1341 parser = XMLParser(target=TreeBuilder()) 1342 parser.feed(text) 1343 return parser.close() 1344 1345 1346def XMLID(text, parser=None): 1347 """Parse XML document from string constant for its IDs. 1348 1349 *text* is a string containing XML data, *parser* is an 1350 optional parser instance, defaulting to the standard XMLParser. 1351 1352 Returns an (Element, dict) tuple, in which the 1353 dict maps element id:s to elements. 1354 1355 """ 1356 if not parser: 1357 parser = XMLParser(target=TreeBuilder()) 1358 parser.feed(text) 1359 tree = parser.close() 1360 ids = {} 1361 for elem in tree.iter(): 1362 id = elem.get("id") 1363 if id: 1364 ids[id] = elem 1365 return tree, ids 1366 1367# Parse XML document from string constant. Alias for XML(). 1368fromstring = XML 1369 1370def fromstringlist(sequence, parser=None): 1371 """Parse XML document from sequence of string fragments. 1372 1373 *sequence* is a list of other sequence, *parser* is an optional parser 1374 instance, defaulting to the standard XMLParser. 1375 1376 Returns an Element instance. 1377 1378 """ 1379 if not parser: 1380 parser = XMLParser(target=TreeBuilder()) 1381 for text in sequence: 1382 parser.feed(text) 1383 return parser.close() 1384 1385# -------------------------------------------------------------------- 1386 1387 1388class TreeBuilder: 1389 """Generic element structure builder. 1390 1391 This builder converts a sequence of start, data, and end method 1392 calls to a well-formed element structure. 1393 1394 You can use this class to build an element structure using a custom XML 1395 parser, or a parser for some other XML-like format. 1396 1397 *element_factory* is an optional element factory which is called 1398 to create new Element instances, as necessary. 1399 1400 *comment_factory* is a factory to create comments to be used instead of 1401 the standard factory. If *insert_comments* is false (the default), 1402 comments will not be inserted into the tree. 1403 1404 *pi_factory* is a factory to create processing instructions to be used 1405 instead of the standard factory. If *insert_pis* is false (the default), 1406 processing instructions will not be inserted into the tree. 1407 """ 1408 def __init__(self, element_factory=None, *, 1409 comment_factory=None, pi_factory=None, 1410 insert_comments=False, insert_pis=False): 1411 self._data = [] # data collector 1412 self._elem = [] # element stack 1413 self._last = None # last element 1414 self._root = None # root element 1415 self._tail = None # true if we're after an end tag 1416 if comment_factory is None: 1417 comment_factory = Comment 1418 self._comment_factory = comment_factory 1419 self.insert_comments = insert_comments 1420 if pi_factory is None: 1421 pi_factory = ProcessingInstruction 1422 self._pi_factory = pi_factory 1423 self.insert_pis = insert_pis 1424 if element_factory is None: 1425 element_factory = Element 1426 self._factory = element_factory 1427 1428 def close(self): 1429 """Flush builder buffers and return toplevel document Element.""" 1430 assert len(self._elem) == 0, "missing end tags" 1431 assert self._root is not None, "missing toplevel element" 1432 return self._root 1433 1434 def _flush(self): 1435 if self._data: 1436 if self._last is not None: 1437 text = "".join(self._data) 1438 if self._tail: 1439 assert self._last.tail is None, "internal error (tail)" 1440 self._last.tail = text 1441 else: 1442 assert self._last.text is None, "internal error (text)" 1443 self._last.text = text 1444 self._data = [] 1445 1446 def data(self, data): 1447 """Add text to current element.""" 1448 self._data.append(data) 1449 1450 def start(self, tag, attrs): 1451 """Open new element and return it. 1452 1453 *tag* is the element name, *attrs* is a dict containing element 1454 attributes. 1455 1456 """ 1457 self._flush() 1458 self._last = elem = self._factory(tag, attrs) 1459 if self._elem: 1460 self._elem[-1].append(elem) 1461 elif self._root is None: 1462 self._root = elem 1463 self._elem.append(elem) 1464 self._tail = 0 1465 return elem 1466 1467 def end(self, tag): 1468 """Close and return current Element. 1469 1470 *tag* is the element name. 1471 1472 """ 1473 self._flush() 1474 self._last = self._elem.pop() 1475 assert self._last.tag == tag,\ 1476 "end tag mismatch (expected %s, got %s)" % ( 1477 self._last.tag, tag) 1478 self._tail = 1 1479 return self._last 1480 1481 def comment(self, text): 1482 """Create a comment using the comment_factory. 1483 1484 *text* is the text of the comment. 1485 """ 1486 return self._handle_single( 1487 self._comment_factory, self.insert_comments, text) 1488 1489 def pi(self, target, text=None): 1490 """Create a processing instruction using the pi_factory. 1491 1492 *target* is the target name of the processing instruction. 1493 *text* is the data of the processing instruction, or ''. 1494 """ 1495 return self._handle_single( 1496 self._pi_factory, self.insert_pis, target, text) 1497 1498 def _handle_single(self, factory, insert, *args): 1499 elem = factory(*args) 1500 if insert: 1501 self._flush() 1502 self._last = elem 1503 if self._elem: 1504 self._elem[-1].append(elem) 1505 self._tail = 1 1506 return elem 1507 1508 1509# also see ElementTree and TreeBuilder 1510class XMLParser: 1511 """Element structure builder for XML source data based on the expat parser. 1512 1513 *target* is an optional target object which defaults to an instance of the 1514 standard TreeBuilder class, *encoding* is an optional encoding string 1515 which if given, overrides the encoding specified in the XML file: 1516 http://www.iana.org/assignments/character-sets 1517 1518 """ 1519 1520 def __init__(self, *, target=None, encoding=None): 1521 try: 1522 from xml.parsers import expat 1523 except ImportError: 1524 try: 1525 import pyexpat as expat 1526 except ImportError: 1527 raise ImportError( 1528 "No module named expat; use SimpleXMLTreeBuilder instead" 1529 ) 1530 parser = expat.ParserCreate(encoding, "}") 1531 if target is None: 1532 target = TreeBuilder() 1533 # underscored names are provided for compatibility only 1534 self.parser = self._parser = parser 1535 self.target = self._target = target 1536 self._error = expat.error 1537 self._names = {} # name memo cache 1538 # main callbacks 1539 parser.DefaultHandlerExpand = self._default 1540 if hasattr(target, 'start'): 1541 parser.StartElementHandler = self._start 1542 if hasattr(target, 'end'): 1543 parser.EndElementHandler = self._end 1544 if hasattr(target, 'start_ns'): 1545 parser.StartNamespaceDeclHandler = self._start_ns 1546 if hasattr(target, 'end_ns'): 1547 parser.EndNamespaceDeclHandler = self._end_ns 1548 if hasattr(target, 'data'): 1549 parser.CharacterDataHandler = target.data 1550 # miscellaneous callbacks 1551 if hasattr(target, 'comment'): 1552 parser.CommentHandler = target.comment 1553 if hasattr(target, 'pi'): 1554 parser.ProcessingInstructionHandler = target.pi 1555 # Configure pyexpat: buffering, new-style attribute handling. 1556 parser.buffer_text = 1 1557 parser.ordered_attributes = 1 1558 self._doctype = None 1559 self.entity = {} 1560 try: 1561 self.version = "Expat %d.%d.%d" % expat.version_info 1562 except AttributeError: 1563 pass # unknown 1564 1565 def _setevents(self, events_queue, events_to_report): 1566 # Internal API for XMLPullParser 1567 # events_to_report: a list of events to report during parsing (same as 1568 # the *events* of XMLPullParser's constructor. 1569 # events_queue: a list of actual parsing events that will be populated 1570 # by the underlying parser. 1571 # 1572 parser = self._parser 1573 append = events_queue.append 1574 for event_name in events_to_report: 1575 if event_name == "start": 1576 parser.ordered_attributes = 1 1577 def handler(tag, attrib_in, event=event_name, append=append, 1578 start=self._start): 1579 append((event, start(tag, attrib_in))) 1580 parser.StartElementHandler = handler 1581 elif event_name == "end": 1582 def handler(tag, event=event_name, append=append, 1583 end=self._end): 1584 append((event, end(tag))) 1585 parser.EndElementHandler = handler 1586 elif event_name == "start-ns": 1587 # TreeBuilder does not implement .start_ns() 1588 if hasattr(self.target, "start_ns"): 1589 def handler(prefix, uri, event=event_name, append=append, 1590 start_ns=self._start_ns): 1591 append((event, start_ns(prefix, uri))) 1592 else: 1593 def handler(prefix, uri, event=event_name, append=append): 1594 append((event, (prefix or '', uri or ''))) 1595 parser.StartNamespaceDeclHandler = handler 1596 elif event_name == "end-ns": 1597 # TreeBuilder does not implement .end_ns() 1598 if hasattr(self.target, "end_ns"): 1599 def handler(prefix, event=event_name, append=append, 1600 end_ns=self._end_ns): 1601 append((event, end_ns(prefix))) 1602 else: 1603 def handler(prefix, event=event_name, append=append): 1604 append((event, None)) 1605 parser.EndNamespaceDeclHandler = handler 1606 elif event_name == 'comment': 1607 def handler(text, event=event_name, append=append, self=self): 1608 append((event, self.target.comment(text))) 1609 parser.CommentHandler = handler 1610 elif event_name == 'pi': 1611 def handler(pi_target, data, event=event_name, append=append, 1612 self=self): 1613 append((event, self.target.pi(pi_target, data))) 1614 parser.ProcessingInstructionHandler = handler 1615 else: 1616 raise ValueError("unknown event %r" % event_name) 1617 1618 def _raiseerror(self, value): 1619 err = ParseError(value) 1620 err.code = value.code 1621 err.position = value.lineno, value.offset 1622 raise err 1623 1624 def _fixname(self, key): 1625 # expand qname, and convert name string to ascii, if possible 1626 try: 1627 name = self._names[key] 1628 except KeyError: 1629 name = key 1630 if "}" in name: 1631 name = "{" + name 1632 self._names[key] = name 1633 return name 1634 1635 def _start_ns(self, prefix, uri): 1636 return self.target.start_ns(prefix or '', uri or '') 1637 1638 def _end_ns(self, prefix): 1639 return self.target.end_ns(prefix or '') 1640 1641 def _start(self, tag, attr_list): 1642 # Handler for expat's StartElementHandler. Since ordered_attributes 1643 # is set, the attributes are reported as a list of alternating 1644 # attribute name,value. 1645 fixname = self._fixname 1646 tag = fixname(tag) 1647 attrib = {} 1648 if attr_list: 1649 for i in range(0, len(attr_list), 2): 1650 attrib[fixname(attr_list[i])] = attr_list[i+1] 1651 return self.target.start(tag, attrib) 1652 1653 def _end(self, tag): 1654 return self.target.end(self._fixname(tag)) 1655 1656 def _default(self, text): 1657 prefix = text[:1] 1658 if prefix == "&": 1659 # deal with undefined entities 1660 try: 1661 data_handler = self.target.data 1662 except AttributeError: 1663 return 1664 try: 1665 data_handler(self.entity[text[1:-1]]) 1666 except KeyError: 1667 from xml.parsers import expat 1668 err = expat.error( 1669 "undefined entity %s: line %d, column %d" % 1670 (text, self.parser.ErrorLineNumber, 1671 self.parser.ErrorColumnNumber) 1672 ) 1673 err.code = 11 # XML_ERROR_UNDEFINED_ENTITY 1674 err.lineno = self.parser.ErrorLineNumber 1675 err.offset = self.parser.ErrorColumnNumber 1676 raise err 1677 elif prefix == "<" and text[:9] == "<!DOCTYPE": 1678 self._doctype = [] # inside a doctype declaration 1679 elif self._doctype is not None: 1680 # parse doctype contents 1681 if prefix == ">": 1682 self._doctype = None 1683 return 1684 text = text.strip() 1685 if not text: 1686 return 1687 self._doctype.append(text) 1688 n = len(self._doctype) 1689 if n > 2: 1690 type = self._doctype[1] 1691 if type == "PUBLIC" and n == 4: 1692 name, type, pubid, system = self._doctype 1693 if pubid: 1694 pubid = pubid[1:-1] 1695 elif type == "SYSTEM" and n == 3: 1696 name, type, system = self._doctype 1697 pubid = None 1698 else: 1699 return 1700 if hasattr(self.target, "doctype"): 1701 self.target.doctype(name, pubid, system[1:-1]) 1702 elif hasattr(self, "doctype"): 1703 warnings.warn( 1704 "The doctype() method of XMLParser is ignored. " 1705 "Define doctype() method on the TreeBuilder target.", 1706 RuntimeWarning) 1707 1708 self._doctype = None 1709 1710 def feed(self, data): 1711 """Feed encoded data to parser.""" 1712 try: 1713 self.parser.Parse(data, False) 1714 except self._error as v: 1715 self._raiseerror(v) 1716 1717 def close(self): 1718 """Finish feeding data to parser and return element structure.""" 1719 try: 1720 self.parser.Parse(b"", True) # end of data 1721 except self._error as v: 1722 self._raiseerror(v) 1723 try: 1724 close_handler = self.target.close 1725 except AttributeError: 1726 pass 1727 else: 1728 return close_handler() 1729 finally: 1730 # get rid of circular references 1731 del self.parser, self._parser 1732 del self.target, self._target 1733 1734 def flush(self): 1735 was_enabled = self.parser.GetReparseDeferralEnabled() 1736 try: 1737 self.parser.SetReparseDeferralEnabled(False) 1738 self.parser.Parse(b"", False) 1739 except self._error as v: 1740 self._raiseerror(v) 1741 finally: 1742 self.parser.SetReparseDeferralEnabled(was_enabled) 1743 1744# -------------------------------------------------------------------- 1745# C14N 2.0 1746 1747def canonicalize(xml_data=None, *, out=None, from_file=None, **options): 1748 """Convert XML to its C14N 2.0 serialised form. 1749 1750 If *out* is provided, it must be a file or file-like object that receives 1751 the serialised canonical XML output (text, not bytes) through its ``.write()`` 1752 method. To write to a file, open it in text mode with encoding "utf-8". 1753 If *out* is not provided, this function returns the output as text string. 1754 1755 Either *xml_data* (an XML string) or *from_file* (a file path or 1756 file-like object) must be provided as input. 1757 1758 The configuration options are the same as for the ``C14NWriterTarget``. 1759 """ 1760 if xml_data is None and from_file is None: 1761 raise ValueError("Either 'xml_data' or 'from_file' must be provided as input") 1762 sio = None 1763 if out is None: 1764 sio = out = io.StringIO() 1765 1766 parser = XMLParser(target=C14NWriterTarget(out.write, **options)) 1767 1768 if xml_data is not None: 1769 parser.feed(xml_data) 1770 parser.close() 1771 elif from_file is not None: 1772 parse(from_file, parser=parser) 1773 1774 return sio.getvalue() if sio is not None else None 1775 1776 1777_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match 1778 1779 1780class C14NWriterTarget: 1781 """ 1782 Canonicalization writer target for the XMLParser. 1783 1784 Serialises parse events to XML C14N 2.0. 1785 1786 The *write* function is used for writing out the resulting data stream 1787 as text (not bytes). To write to a file, open it in text mode with encoding 1788 "utf-8" and pass its ``.write`` method. 1789 1790 Configuration options: 1791 1792 - *with_comments*: set to true to include comments 1793 - *strip_text*: set to true to strip whitespace before and after text content 1794 - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}" 1795 - *qname_aware_tags*: a set of qname aware tag names in which prefixes 1796 should be replaced in text content 1797 - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes 1798 should be replaced in text content 1799 - *exclude_attrs*: a set of attribute names that should not be serialised 1800 - *exclude_tags*: a set of tag names that should not be serialised 1801 """ 1802 def __init__(self, write, *, 1803 with_comments=False, strip_text=False, rewrite_prefixes=False, 1804 qname_aware_tags=None, qname_aware_attrs=None, 1805 exclude_attrs=None, exclude_tags=None): 1806 self._write = write 1807 self._data = [] 1808 self._with_comments = with_comments 1809 self._strip_text = strip_text 1810 self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None 1811 self._exclude_tags = set(exclude_tags) if exclude_tags else None 1812 1813 self._rewrite_prefixes = rewrite_prefixes 1814 if qname_aware_tags: 1815 self._qname_aware_tags = set(qname_aware_tags) 1816 else: 1817 self._qname_aware_tags = None 1818 if qname_aware_attrs: 1819 self._find_qname_aware_attrs = set(qname_aware_attrs).intersection 1820 else: 1821 self._find_qname_aware_attrs = None 1822 1823 # Stack with globally and newly declared namespaces as (uri, prefix) pairs. 1824 self._declared_ns_stack = [[ 1825 ("http://www.w3.org/XML/1998/namespace", "xml"), 1826 ]] 1827 # Stack with user declared namespace prefixes as (uri, prefix) pairs. 1828 self._ns_stack = [] 1829 if not rewrite_prefixes: 1830 self._ns_stack.append(list(_namespace_map.items())) 1831 self._ns_stack.append([]) 1832 self._prefix_map = {} 1833 self._preserve_space = [False] 1834 self._pending_start = None 1835 self._root_seen = False 1836 self._root_done = False 1837 self._ignored_depth = 0 1838 1839 def _iter_namespaces(self, ns_stack, _reversed=reversed): 1840 for namespaces in _reversed(ns_stack): 1841 if namespaces: # almost no element declares new namespaces 1842 yield from namespaces 1843 1844 def _resolve_prefix_name(self, prefixed_name): 1845 prefix, name = prefixed_name.split(':', 1) 1846 for uri, p in self._iter_namespaces(self._ns_stack): 1847 if p == prefix: 1848 return f'{{{uri}}}{name}' 1849 raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope') 1850 1851 def _qname(self, qname, uri=None): 1852 if uri is None: 1853 uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname) 1854 else: 1855 tag = qname 1856 1857 prefixes_seen = set() 1858 for u, prefix in self._iter_namespaces(self._declared_ns_stack): 1859 if u == uri and prefix not in prefixes_seen: 1860 return f'{prefix}:{tag}' if prefix else tag, tag, uri 1861 prefixes_seen.add(prefix) 1862 1863 # Not declared yet => add new declaration. 1864 if self._rewrite_prefixes: 1865 if uri in self._prefix_map: 1866 prefix = self._prefix_map[uri] 1867 else: 1868 prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}' 1869 self._declared_ns_stack[-1].append((uri, prefix)) 1870 return f'{prefix}:{tag}', tag, uri 1871 1872 if not uri and '' not in prefixes_seen: 1873 # No default namespace declared => no prefix needed. 1874 return tag, tag, uri 1875 1876 for u, prefix in self._iter_namespaces(self._ns_stack): 1877 if u == uri: 1878 self._declared_ns_stack[-1].append((uri, prefix)) 1879 return f'{prefix}:{tag}' if prefix else tag, tag, uri 1880 1881 if not uri: 1882 # As soon as a default namespace is defined, 1883 # anything that has no namespace (and thus, no prefix) goes there. 1884 return tag, tag, uri 1885 1886 raise ValueError(f'Namespace "{uri}" is not declared in scope') 1887 1888 def data(self, data): 1889 if not self._ignored_depth: 1890 self._data.append(data) 1891 1892 def _flush(self, _join_text=''.join): 1893 data = _join_text(self._data) 1894 del self._data[:] 1895 if self._strip_text and not self._preserve_space[-1]: 1896 data = data.strip() 1897 if self._pending_start is not None: 1898 args, self._pending_start = self._pending_start, None 1899 qname_text = data if data and _looks_like_prefix_name(data) else None 1900 self._start(*args, qname_text) 1901 if qname_text is not None: 1902 return 1903 if data and self._root_seen: 1904 self._write(_escape_cdata_c14n(data)) 1905 1906 def start_ns(self, prefix, uri): 1907 if self._ignored_depth: 1908 return 1909 # we may have to resolve qnames in text content 1910 if self._data: 1911 self._flush() 1912 self._ns_stack[-1].append((uri, prefix)) 1913 1914 def start(self, tag, attrs): 1915 if self._exclude_tags is not None and ( 1916 self._ignored_depth or tag in self._exclude_tags): 1917 self._ignored_depth += 1 1918 return 1919 if self._data: 1920 self._flush() 1921 1922 new_namespaces = [] 1923 self._declared_ns_stack.append(new_namespaces) 1924 1925 if self._qname_aware_tags is not None and tag in self._qname_aware_tags: 1926 # Need to parse text first to see if it requires a prefix declaration. 1927 self._pending_start = (tag, attrs, new_namespaces) 1928 return 1929 self._start(tag, attrs, new_namespaces) 1930 1931 def _start(self, tag, attrs, new_namespaces, qname_text=None): 1932 if self._exclude_attrs is not None and attrs: 1933 attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs} 1934 1935 qnames = {tag, *attrs} 1936 resolved_names = {} 1937 1938 # Resolve prefixes in attribute and tag text. 1939 if qname_text is not None: 1940 qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text) 1941 qnames.add(qname) 1942 if self._find_qname_aware_attrs is not None and attrs: 1943 qattrs = self._find_qname_aware_attrs(attrs) 1944 if qattrs: 1945 for attr_name in qattrs: 1946 value = attrs[attr_name] 1947 if _looks_like_prefix_name(value): 1948 qname = resolved_names[value] = self._resolve_prefix_name(value) 1949 qnames.add(qname) 1950 else: 1951 qattrs = None 1952 else: 1953 qattrs = None 1954 1955 # Assign prefixes in lexicographical order of used URIs. 1956 parse_qname = self._qname 1957 parsed_qnames = {n: parse_qname(n) for n in sorted( 1958 qnames, key=lambda n: n.split('}', 1))} 1959 1960 # Write namespace declarations in prefix order ... 1961 if new_namespaces: 1962 attr_list = [ 1963 ('xmlns:' + prefix if prefix else 'xmlns', uri) 1964 for uri, prefix in new_namespaces 1965 ] 1966 attr_list.sort() 1967 else: 1968 # almost always empty 1969 attr_list = [] 1970 1971 # ... followed by attributes in URI+name order 1972 if attrs: 1973 for k, v in sorted(attrs.items()): 1974 if qattrs is not None and k in qattrs and v in resolved_names: 1975 v = parsed_qnames[resolved_names[v]][0] 1976 attr_qname, attr_name, uri = parsed_qnames[k] 1977 # No prefix for attributes in default ('') namespace. 1978 attr_list.append((attr_qname if uri else attr_name, v)) 1979 1980 # Honour xml:space attributes. 1981 space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space') 1982 self._preserve_space.append( 1983 space_behaviour == 'preserve' if space_behaviour 1984 else self._preserve_space[-1]) 1985 1986 # Write the tag. 1987 write = self._write 1988 write('<' + parsed_qnames[tag][0]) 1989 if attr_list: 1990 write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list])) 1991 write('>') 1992 1993 # Write the resolved qname text content. 1994 if qname_text is not None: 1995 write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0])) 1996 1997 self._root_seen = True 1998 self._ns_stack.append([]) 1999 2000 def end(self, tag): 2001 if self._ignored_depth: 2002 self._ignored_depth -= 1 2003 return 2004 if self._data: 2005 self._flush() 2006 self._write(f'</{self._qname(tag)[0]}>') 2007 self._preserve_space.pop() 2008 self._root_done = len(self._preserve_space) == 1 2009 self._declared_ns_stack.pop() 2010 self._ns_stack.pop() 2011 2012 def comment(self, text): 2013 if not self._with_comments: 2014 return 2015 if self._ignored_depth: 2016 return 2017 if self._root_done: 2018 self._write('\n') 2019 elif self._root_seen and self._data: 2020 self._flush() 2021 self._write(f'<!--{_escape_cdata_c14n(text)}-->') 2022 if not self._root_seen: 2023 self._write('\n') 2024 2025 def pi(self, target, data): 2026 if self._ignored_depth: 2027 return 2028 if self._root_done: 2029 self._write('\n') 2030 elif self._root_seen and self._data: 2031 self._flush() 2032 self._write( 2033 f'<?{target} {_escape_cdata_c14n(data)}?>' if data else f'<?{target}?>') 2034 if not self._root_seen: 2035 self._write('\n') 2036 2037 2038def _escape_cdata_c14n(text): 2039 # escape character data 2040 try: 2041 # it's worth avoiding do-nothing calls for strings that are 2042 # shorter than 500 character, or so. assume that's, by far, 2043 # the most common case in most applications. 2044 if '&' in text: 2045 text = text.replace('&', '&') 2046 if '<' in text: 2047 text = text.replace('<', '<') 2048 if '>' in text: 2049 text = text.replace('>', '>') 2050 if '\r' in text: 2051 text = text.replace('\r', '
') 2052 return text 2053 except (TypeError, AttributeError): 2054 _raise_serialization_error(text) 2055 2056 2057def _escape_attrib_c14n(text): 2058 # escape attribute value 2059 try: 2060 if '&' in text: 2061 text = text.replace('&', '&') 2062 if '<' in text: 2063 text = text.replace('<', '<') 2064 if '"' in text: 2065 text = text.replace('"', '"') 2066 if '\t' in text: 2067 text = text.replace('\t', '	') 2068 if '\n' in text: 2069 text = text.replace('\n', '
') 2070 if '\r' in text: 2071 text = text.replace('\r', '
') 2072 return text 2073 except (TypeError, AttributeError): 2074 _raise_serialization_error(text) 2075 2076 2077# -------------------------------------------------------------------- 2078 2079# Import the C accelerators 2080try: 2081 # Element is going to be shadowed by the C implementation. We need to keep 2082 # the Python version of it accessible for some "creative" by external code 2083 # (see tests) 2084 _Element_Py = Element 2085 2086 # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories 2087 from _elementtree import * 2088 from _elementtree import _set_factories 2089except ImportError: 2090 pass 2091else: 2092 _set_factories(Comment, ProcessingInstruction) 2093