1"""Shim module exporting the same ElementTree API for lxml and 2xml.etree backends. 3 4When lxml is installed, it is automatically preferred over the built-in 5xml.etree module. 6On Python 2.7, the cElementTree module is preferred over the pure-python 7ElementTree module. 8 9Besides exporting a unified interface, this also defines extra functions 10or subclasses built-in ElementTree classes to add features that are 11only availble in lxml, like OrderedDict for attributes, pretty_print and 12iterwalk. 13""" 14from __future__ import absolute_import, unicode_literals 15from fontTools.misc.py23 import basestring, unicode, tounicode, open 16 17 18XML_DECLARATION = """<?xml version='1.0' encoding='%s'?>""" 19 20__all__ = [ 21 # public symbols 22 "Comment", 23 "dump", 24 "Element", 25 "ElementTree", 26 "fromstring", 27 "fromstringlist", 28 "iselement", 29 "iterparse", 30 "parse", 31 "ParseError", 32 "PI", 33 "ProcessingInstruction", 34 "QName", 35 "SubElement", 36 "tostring", 37 "tostringlist", 38 "TreeBuilder", 39 "XML", 40 "XMLParser", 41 "register_namespace", 42] 43 44try: 45 from lxml.etree import * 46 47 _have_lxml = True 48except ImportError: 49 try: 50 from xml.etree.cElementTree import * 51 52 # the cElementTree version of XML function doesn't support 53 # the optional 'parser' keyword argument 54 from xml.etree.ElementTree import XML 55 except ImportError: # pragma: no cover 56 from xml.etree.ElementTree import * 57 _have_lxml = False 58 59 import sys 60 61 # dict is always ordered in python >= 3.6 and on pypy 62 PY36 = sys.version_info >= (3, 6) 63 try: 64 import __pypy__ 65 except ImportError: 66 __pypy__ = None 67 _dict_is_ordered = bool(PY36 or __pypy__) 68 del PY36, __pypy__ 69 70 if _dict_is_ordered: 71 _Attrib = dict 72 else: 73 from collections import OrderedDict as _Attrib 74 75 if isinstance(Element, type): 76 _Element = Element 77 else: 78 # in py27, cElementTree.Element cannot be subclassed, so 79 # we need to import the pure-python class 80 from xml.etree.ElementTree import Element as _Element 81 82 class Element(_Element): 83 """Element subclass that keeps the order of attributes.""" 84 85 def __init__(self, tag, attrib=_Attrib(), **extra): 86 super(Element, self).__init__(tag) 87 self.attrib = _Attrib() 88 if attrib: 89 self.attrib.update(attrib) 90 if extra: 91 self.attrib.update(extra) 92 93 def SubElement(parent, tag, attrib=_Attrib(), **extra): 94 """Must override SubElement as well otherwise _elementtree.SubElement 95 fails if 'parent' is a subclass of Element object. 96 """ 97 element = parent.__class__(tag, attrib, **extra) 98 parent.append(element) 99 return element 100 101 def _iterwalk(element, events, tag): 102 include = tag is None or element.tag == tag 103 if include and "start" in events: 104 yield ("start", element) 105 for e in element: 106 for item in _iterwalk(e, events, tag): 107 yield item 108 if include: 109 yield ("end", element) 110 111 def iterwalk(element_or_tree, events=("end",), tag=None): 112 """A tree walker that generates events from an existing tree as 113 if it was parsing XML data with iterparse(). 114 Drop-in replacement for lxml.etree.iterwalk. 115 """ 116 if iselement(element_or_tree): 117 element = element_or_tree 118 else: 119 element = element_or_tree.getroot() 120 if tag == "*": 121 tag = None 122 for item in _iterwalk(element, events, tag): 123 yield item 124 125 _ElementTree = ElementTree 126 127 class ElementTree(_ElementTree): 128 """ElementTree subclass that adds 'pretty_print' and 'doctype' 129 arguments to the 'write' method. 130 Currently these are only supported for the default XML serialization 131 'method', and not also for "html" or "text", for these are delegated 132 to the base class. 133 """ 134 135 def write( 136 self, 137 file_or_filename, 138 encoding=None, 139 xml_declaration=False, 140 method=None, 141 doctype=None, 142 pretty_print=False, 143 ): 144 if method and method != "xml": 145 # delegate to super-class 146 super(ElementTree, self).write( 147 file_or_filename, 148 encoding=encoding, 149 xml_declaration=xml_declaration, 150 method=method, 151 ) 152 return 153 154 if encoding is unicode or ( 155 encoding is not None and encoding.lower() == "unicode" 156 ): 157 if xml_declaration: 158 raise ValueError( 159 "Serialisation to unicode must not request an XML declaration" 160 ) 161 write_declaration = False 162 encoding = "unicode" 163 elif xml_declaration is None: 164 # by default, write an XML declaration only for non-standard encodings 165 write_declaration = encoding is not None and encoding.upper() not in ( 166 "ASCII", 167 "UTF-8", 168 "UTF8", 169 "US-ASCII", 170 ) 171 else: 172 write_declaration = xml_declaration 173 174 if encoding is None: 175 encoding = "ASCII" 176 177 if pretty_print: 178 # NOTE this will modify the tree in-place 179 _indent(self._root) 180 181 with _get_writer(file_or_filename, encoding) as write: 182 if write_declaration: 183 write(XML_DECLARATION % encoding.upper()) 184 if pretty_print: 185 write("\n") 186 if doctype: 187 write(_tounicode(doctype)) 188 if pretty_print: 189 write("\n") 190 191 qnames, namespaces = _namespaces(self._root) 192 _serialize_xml(write, self._root, qnames, namespaces) 193 194 import io 195 196 def tostring( 197 element, 198 encoding=None, 199 xml_declaration=None, 200 method=None, 201 doctype=None, 202 pretty_print=False, 203 ): 204 """Custom 'tostring' function that uses our ElementTree subclass, with 205 pretty_print support. 206 """ 207 stream = io.StringIO() if encoding == "unicode" else io.BytesIO() 208 ElementTree(element).write( 209 stream, 210 encoding=encoding, 211 xml_declaration=xml_declaration, 212 method=method, 213 doctype=doctype, 214 pretty_print=pretty_print, 215 ) 216 return stream.getvalue() 217 218 # serialization support 219 220 import re 221 222 # Valid XML strings can include any Unicode character, excluding control 223 # characters, the surrogate blocks, FFFE, and FFFF: 224 # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] 225 # Here we reversed the pattern to match only the invalid characters. 226 # For the 'narrow' python builds supporting only UCS-2, which represent 227 # characters beyond BMP as UTF-16 surrogate pairs, we need to pass through 228 # the surrogate block. I haven't found a more elegant solution... 229 UCS2 = sys.maxunicode < 0x10FFFF 230 if UCS2: 231 _invalid_xml_string = re.compile( 232 "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uFFFE-\uFFFF]" 233 ) 234 else: 235 _invalid_xml_string = re.compile( 236 "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uD800-\uDFFF\uFFFE-\uFFFF]" 237 ) 238 239 def _tounicode(s): 240 """Test if a string is valid user input and decode it to unicode string 241 using ASCII encoding if it's a bytes string. 242 Reject all bytes/unicode input that contains non-XML characters. 243 Reject all bytes input that contains non-ASCII characters. 244 """ 245 try: 246 s = tounicode(s, encoding="ascii", errors="strict") 247 except UnicodeDecodeError: 248 raise ValueError( 249 "Bytes strings can only contain ASCII characters. " 250 "Use unicode strings for non-ASCII characters.") 251 except AttributeError: 252 _raise_serialization_error(s) 253 if s and _invalid_xml_string.search(s): 254 raise ValueError( 255 "All strings must be XML compatible: Unicode or ASCII, " 256 "no NULL bytes or control characters" 257 ) 258 return s 259 260 import contextlib 261 262 @contextlib.contextmanager 263 def _get_writer(file_or_filename, encoding): 264 # returns text write method and release all resources after using 265 try: 266 write = file_or_filename.write 267 except AttributeError: 268 # file_or_filename is a file name 269 f = open( 270 file_or_filename, 271 "w", 272 encoding="utf-8" if encoding == "unicode" else encoding, 273 errors="xmlcharrefreplace", 274 ) 275 with f: 276 yield f.write 277 else: 278 # file_or_filename is a file-like object 279 # encoding determines if it is a text or binary writer 280 if encoding == "unicode": 281 # use a text writer as is 282 yield write 283 else: 284 # wrap a binary writer with TextIOWrapper 285 detach_buffer = False 286 if isinstance(file_or_filename, io.BufferedIOBase): 287 buf = file_or_filename 288 elif isinstance(file_or_filename, io.RawIOBase): 289 buf = io.BufferedWriter(file_or_filename) 290 detach_buffer = True 291 else: 292 # This is to handle passed objects that aren't in the 293 # IOBase hierarchy, but just have a write method 294 buf = io.BufferedIOBase() 295 buf.writable = lambda: True 296 buf.write = write 297 try: 298 # TextIOWrapper uses this methods to determine 299 # if BOM (for UTF-16, etc) should be added 300 buf.seekable = file_or_filename.seekable 301 buf.tell = file_or_filename.tell 302 except AttributeError: 303 pass 304 wrapper = io.TextIOWrapper( 305 buf, 306 encoding=encoding, 307 errors="xmlcharrefreplace", 308 newline="\n", 309 ) 310 try: 311 yield wrapper.write 312 finally: 313 # Keep the original file open when the TextIOWrapper and 314 # the BufferedWriter are destroyed 315 wrapper.detach() 316 if detach_buffer: 317 buf.detach() 318 319 from xml.etree.ElementTree import _namespace_map 320 321 def _namespaces(elem): 322 # identify namespaces used in this tree 323 324 # maps qnames to *encoded* prefix:local names 325 qnames = {None: None} 326 327 # maps uri:s to prefixes 328 namespaces = {} 329 330 def add_qname(qname): 331 # calculate serialized qname representation 332 try: 333 qname = _tounicode(qname) 334 if qname[:1] == "{": 335 uri, tag = qname[1:].rsplit("}", 1) 336 prefix = namespaces.get(uri) 337 if prefix is None: 338 prefix = _namespace_map.get(uri) 339 if prefix is None: 340 prefix = "ns%d" % len(namespaces) 341 else: 342 prefix = _tounicode(prefix) 343 if prefix != "xml": 344 namespaces[uri] = prefix 345 if prefix: 346 qnames[qname] = "%s:%s" % (prefix, tag) 347 else: 348 qnames[qname] = tag # default element 349 else: 350 qnames[qname] = qname 351 except TypeError: 352 _raise_serialization_error(qname) 353 354 # populate qname and namespaces table 355 for elem in elem.iter(): 356 tag = elem.tag 357 if isinstance(tag, QName): 358 if tag.text not in qnames: 359 add_qname(tag.text) 360 elif isinstance(tag, basestring): 361 if tag not in qnames: 362 add_qname(tag) 363 elif tag is not None and tag is not Comment and tag is not PI: 364 _raise_serialization_error(tag) 365 for key, value in elem.items(): 366 if isinstance(key, QName): 367 key = key.text 368 if key not in qnames: 369 add_qname(key) 370 if isinstance(value, QName) and value.text not in qnames: 371 add_qname(value.text) 372 text = elem.text 373 if isinstance(text, QName) and text.text not in qnames: 374 add_qname(text.text) 375 return qnames, namespaces 376 377 def _serialize_xml(write, elem, qnames, namespaces, **kwargs): 378 tag = elem.tag 379 text = elem.text 380 if tag is Comment: 381 write("<!--%s-->" % _tounicode(text)) 382 elif tag is ProcessingInstruction: 383 write("<?%s?>" % _tounicode(text)) 384 else: 385 tag = qnames[_tounicode(tag) if tag is not None else None] 386 if tag is None: 387 if text: 388 write(_escape_cdata(text)) 389 for e in elem: 390 _serialize_xml(write, e, qnames, None) 391 else: 392 write("<" + tag) 393 if namespaces: 394 for uri, prefix in sorted( 395 namespaces.items(), key=lambda x: x[1] 396 ): # sort on prefix 397 if prefix: 398 prefix = ":" + prefix 399 write(' xmlns%s="%s"' % (prefix, _escape_attrib(uri))) 400 attrs = elem.attrib 401 if attrs: 402 # try to keep existing attrib order 403 if len(attrs) <= 1 or type(attrs) is _Attrib: 404 items = attrs.items() 405 else: 406 # if plain dict, use lexical order 407 items = sorted(attrs.items()) 408 for k, v in items: 409 if isinstance(k, QName): 410 k = _tounicode(k.text) 411 else: 412 k = _tounicode(k) 413 if isinstance(v, QName): 414 v = qnames[_tounicode(v.text)] 415 else: 416 v = _escape_attrib(v) 417 write(' %s="%s"' % (qnames[k], v)) 418 if text is not None or len(elem): 419 write(">") 420 if text: 421 write(_escape_cdata(text)) 422 for e in elem: 423 _serialize_xml(write, e, qnames, None) 424 write("</" + tag + ">") 425 else: 426 write("/>") 427 if elem.tail: 428 write(_escape_cdata(elem.tail)) 429 430 def _raise_serialization_error(text): 431 raise TypeError( 432 "cannot serialize %r (type %s)" % (text, type(text).__name__) 433 ) 434 435 def _escape_cdata(text): 436 # escape character data 437 try: 438 text = _tounicode(text) 439 # it's worth avoiding do-nothing calls for short strings 440 if "&" in text: 441 text = text.replace("&", "&") 442 if "<" in text: 443 text = text.replace("<", "<") 444 if ">" in text: 445 text = text.replace(">", ">") 446 return text 447 except (TypeError, AttributeError): 448 _raise_serialization_error(text) 449 450 def _escape_attrib(text): 451 # escape attribute value 452 try: 453 text = _tounicode(text) 454 if "&" in text: 455 text = text.replace("&", "&") 456 if "<" in text: 457 text = text.replace("<", "<") 458 if ">" in text: 459 text = text.replace(">", ">") 460 if '"' in text: 461 text = text.replace('"', """) 462 if "\n" in text: 463 text = text.replace("\n", " ") 464 return text 465 except (TypeError, AttributeError): 466 _raise_serialization_error(text) 467 468 def _indent(elem, level=0): 469 # From http://effbot.org/zone/element-lib.htm#prettyprint 470 i = "\n" + level * " " 471 if len(elem): 472 if not elem.text or not elem.text.strip(): 473 elem.text = i + " " 474 if not elem.tail or not elem.tail.strip(): 475 elem.tail = i 476 for elem in elem: 477 _indent(elem, level + 1) 478 if not elem.tail or not elem.tail.strip(): 479 elem.tail = i 480 else: 481 if level and (not elem.tail or not elem.tail.strip()): 482 elem.tail = i 483