• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# markdown/serializers.py
2#
3# Add x/html serialization to ElementTree
4# Taken from ElementTree 1.3 preview with slight modifications
5#
6# Copyright (c) 1999-2007 by Fredrik Lundh.  All rights reserved.
7#
8# fredrik@pythonware.com
9# https://www.pythonware.com/
10#
11# --------------------------------------------------------------------
12# The ElementTree toolkit is
13#
14# Copyright (c) 1999-2007 by Fredrik Lundh
15#
16# By obtaining, using, and/or copying this software and/or its
17# associated documentation, you agree that you have read, understood,
18# and will comply with the following terms and conditions:
19#
20# Permission to use, copy, modify, and distribute this software and
21# its associated documentation for any purpose and without fee is
22# hereby granted, provided that the above copyright notice appears in
23# all copies, and that both that copyright notice and this permission
24# notice appear in supporting documentation, and that the name of
25# Secret Labs AB or the author not be used in advertising or publicity
26# pertaining to distribution of the software without specific, written
27# prior permission.
28#
29# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
30# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
31# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
32# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
33# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
34# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
35# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
36# OF THIS SOFTWARE.
37# --------------------------------------------------------------------
38
39
40from xml.etree.ElementTree import ProcessingInstruction
41from xml.etree.ElementTree import Comment, ElementTree, QName
42import re
43
44__all__ = ['to_html_string', 'to_xhtml_string']
45
# Tag names (lower case) that the serializer treats as empty elements: no
# closing tag is written for them, and in XHTML output they are self-closed.
HTML_EMPTY = {
    "area", "base", "basefont", "br", "col", "frame", "hr",
    "img", "input", "isindex", "link", "meta", "param",
}

# Matches an ampersand that does NOT already start a decimal reference
# (`&#38;`), a hexadecimal reference (`&#x26;`) or a named entity (`&amp;`).
# Matching is case-insensitive.
RE_AMP = re.compile(r'&(?!(?:\#[0-9]+|\#x[0-9a-f]+|[0-9a-z]+);)', re.I)
54
55
56def _raise_serialization_error(text):  # pragma: no cover
57    raise TypeError(
58        "cannot serialize {!r} (type {})".format(text, type(text).__name__)
59        )
60
61
62def _escape_cdata(text):
63    # escape character data
64    try:
65        # it's worth avoiding do-nothing calls for strings that are
66        # shorter than 500 character, or so.  assume that's, by far,
67        # the most common case in most applications.
68        if "&" in text:
69            # Only replace & when not part of an entity
70            text = RE_AMP.sub('&', text)
71        if "<" in text:
72            text = text.replace("<", "&lt;")
73        if ">" in text:
74            text = text.replace(">", "&gt;")
75        return text
76    except (TypeError, AttributeError):  # pragma: no cover
77        _raise_serialization_error(text)
78
79
80def _escape_attrib(text):
81    # escape attribute value
82    try:
83        if "&" in text:
84            # Only replace & when not part of an entity
85            text = RE_AMP.sub('&amp;', text)
86        if "<" in text:
87            text = text.replace("<", "&lt;")
88        if ">" in text:
89            text = text.replace(">", "&gt;")
90        if "\"" in text:
91            text = text.replace("\"", "&quot;")
92        if "\n" in text:
93            text = text.replace("\n", "&#10;")
94        return text
95    except (TypeError, AttributeError):  # pragma: no cover
96        _raise_serialization_error(text)
97
98
99def _escape_attrib_html(text):
100    # escape attribute value
101    try:
102        if "&" in text:
103            # Only replace & when not part of an entity
104            text = RE_AMP.sub('&amp;', text)
105        if "<" in text:
106            text = text.replace("<", "&lt;")
107        if ">" in text:
108            text = text.replace(">", "&gt;")
109        if "\"" in text:
110            text = text.replace("\"", "&quot;")
111        return text
112    except (TypeError, AttributeError):  # pragma: no cover
113        _raise_serialization_error(text)
114
115
def _serialize_html(write, elem, format):
    """Recursively emit *elem* and its subtree as HTML or XHTML markup.

    write  -- callable invoked with each output string fragment.
    elem   -- node to serialize; its ``tag``, ``text``, ``tail``, ``items()``
              and child iteration are used.
    format -- ``"html"`` or ``"xhtml"``; selects boolean-attribute
              minimization (html) and self-closing empty tags (xhtml).

    Raises ``ValueError`` when a ``QName`` tag is not of the ``{uri}tag``
    form.
    """
    node_tag = elem.tag
    node_text = elem.text
    if node_tag is Comment:
        write("<!--%s-->" % _escape_cdata(node_text))
    elif node_tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(node_text))
    elif node_tag is None:
        # Tag-less node: only its text and children are written.
        if node_text:
            write(_escape_cdata(node_text))
        for child in elem:
            _serialize_html(write, child, format)
    else:
        ns_uri = None
        if isinstance(node_tag, QName):
            # QName objects store their data as a string: `{uri}tag`
            qualified = node_tag.text
            if qualified[:1] != "{":
                raise ValueError('QName objects must define a tag.')
            ns_uri, node_tag = qualified[1:].split("}", 1)
        write("<" + node_tag)
        attributes = elem.items()
        if attributes:
            # Attributes are written in lexical order.
            for name, value in sorted(attributes):
                if isinstance(name, QName):
                    # Assume a text only QName
                    name = name.text
                if isinstance(value, QName):
                    # Assume a text only QName; written without escaping
                    value = value.text
                else:
                    value = _escape_attrib_html(value)
                if name == value and format == 'html':
                    # Boolean attribute (name equals value): minimized form
                    write(" %s" % value)
                else:
                    write(' {}="{}"'.format(name, value))
        if ns_uri:
            write(' xmlns="%s"' % (_escape_attrib(ns_uri)))
        lowered = node_tag.lower()
        is_empty_tag = lowered in HTML_EMPTY
        if format == "xhtml" and is_empty_tag:
            # Self-closed; text and children are not written in this case.
            write(" />")
        else:
            write(">")
            if node_text:
                if lowered in ["script", "style"]:
                    # script/style content is written verbatim
                    write(node_text)
                else:
                    write(_escape_cdata(node_text))
            for child in elem:
                _serialize_html(write, child, format)
            if not is_empty_tag:
                write("</" + node_tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
171
172
def _write_html(root, format="html"):
    """Serialize the tree rooted at *root* and return it as one string.

    root   -- root node to serialize; must not be ``None``.
    format -- output flavour passed through to ``_serialize_html``
              (``"html"`` by default).
    """
    assert root is not None
    fragments = []
    # The serializer appends each output fragment; they are joined once.
    _serialize_html(fragments.append, root, format)
    return "".join(fragments)
179
180
181# --------------------------------------------------------------------
182# public functions
183
def to_html_string(element):
    """Return *element* and its subtree serialized as an HTML string."""
    root = ElementTree(element).getroot()
    return _write_html(root, format="html")
186
187
def to_xhtml_string(element):
    """Return *element* and its subtree serialized as an XHTML string."""
    root = ElementTree(element).getroot()
    return _write_html(root, format="xhtml")
190