• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1from __future__ import absolute_import, division, unicode_literals
2from six import text_type
3
4try:
5    from functools import reduce
6except ImportError:
7    pass
8
9from ..constants import voidElements, booleanAttributes, spaceCharacters
10from ..constants import rcdataElements, entities, xmlEntities
11from .. import utils
12from xml.sax.saxutils import escape
13
# Flatten the imported collection of whitespace characters into one string so
# it can be concatenated with other characters later in this module.
spaceCharacters = "".join(spaceCharacters)

try:
    from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
    # No custom error handlers available: unencodable characters are errors.
    unicode_encode_errors = "strict"
else:
    unicode_encode_errors = "htmlentityreplace"

    # Maps a code point (int) to the entity name used when encoding it.
    encode_entity_map = {}
    # True when a single string element can hold a non-BMP character.
    is_ucs4 = len("\U0010FFFF") == 1
    for k, v in entities.items():
        # Ignore entities that expand to more than one character (more than
        # two elements when non-BMP characters are stored as surrogate pairs).
        if len(v) > (1 if is_ucs4 else 2):
            continue
        # The ampersand itself never gets an entry in the map.
        if v == "&":
            continue
        v = utils.surrogatePairToCodepoint(v) if len(v) == 2 else ord(v)
        # An all-lowercase entity name always wins over a previously stored
        # name for the same code point; otherwise first come, first served.
        if k.islower() or v not in encode_entity_map:
            encode_entity_map[v] = k

    def htmlentityreplace_errors(exc):
        """Codec error handler producing HTML entities for unencodable text.

        For encode/translate errors, every character in the failing range
        ``exc.object[exc.start:exc.end]`` is replaced by a named entity from
        ``encode_entity_map`` when one exists, otherwise by a hexadecimal
        numeric character reference.  Returns ``(replacement, exc.end)``.
        Any other exception type is delegated to ``xmlcharrefreplace_errors``.
        """
        if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
            return xmlcharrefreplace_errors(exc)

        pieces = []
        second_half_pending = False
        for offset, char in enumerate(exc.object[exc.start:exc.end]):
            if second_half_pending:
                # This element was already consumed as the low surrogate
                # of the pair handled in the previous iteration.
                second_half_pending = False
                continue
            pos = exc.start + offset
            window_end = min([exc.end, pos + 2])
            if utils.isSurrogatePair(exc.object[pos:window_end]):
                cp = utils.surrogatePairToCodepoint(exc.object[pos:pos + 2])
                second_half_pending = True
            else:
                cp = ord(char)

            entity = encode_entity_map.get(cp)
            if entity:
                pieces.append("&")
                pieces.append(entity)
                # Some entity names are stored without the trailing
                # semicolon; always terminate the reference.
                if not entity.endswith(";"):
                    pieces.append(";")
            else:
                pieces.append("&#x%s;" % (hex(cp)[2:]))
        return ("".join(pieces), exc.end)

    register_error(unicode_encode_errors, htmlentityreplace_errors)

    del register_error
71
72
class HTMLSerializer(object):
    """Serialize a stream of treewalker tokens as HTML text or bytes.

    Behaviour is controlled by the option attributes declared below; each one
    listed in ``options`` can be overridden per instance with a keyword
    argument of the same name (see ``__init__``).
    """

    # attribute quoting options
    quote_attr_values = False
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    # Names of the attributes that may be overridden through __init__ kwargs.
    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "omit_optional_tags", "minimize_boolean_attributes",
               "use_trailing_solidus", "space_before_trailing_solidus",
               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer.

        Keyword options (default given first unless specified) include:

        inject_meta_charset=True|False
          Whether to insert a meta element to define the character set of the
          document.
        quote_attr_values=True|False
          Whether to quote attribute values that don't require quoting
          per HTML5 parsing rules.
        quote_char=u'"'|u"'"
          Use given quote character for attribute quoting. Default is to
          use double quote unless attribute value contains a double quote,
          in which case single quotes are used instead.
        escape_lt_in_attrs=False|True
          Whether to escape < in attribute values.
        escape_rcdata=False|True
          Whether to escape characters that need to be escaped within normal
          elements within rcdata elements such as style.
        resolve_entities=True|False
          Whether to resolve named character entities that appear in the
          source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
          are unaffected by this setting.
        strip_whitespace=False|True
          Whether to remove semantically meaningless whitespace. (This
          compresses all whitespace to a single space except within pre.)
        minimize_boolean_attributes=True|False
          Shortens boolean attributes to give just the attribute value,
          for example <input disabled="disabled"> becomes <input disabled>.
        use_trailing_solidus=False|True
          Includes a close-tag slash at the end of the start tag of void
          elements (empty elements whose end tag is forbidden). E.g. <hr/>.
        space_before_trailing_solidus=True|False
          Places a space immediately before the closing slash in a tag
          using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
        sanitize=False|True
          Strip all unsafe or unknown constructs from output.
          See `html5lib user documentation`_
        omit_optional_tags=True|False
          Omit start/end tags that are optional.
        alphabetical_attributes=False|True
          Reorder attributes to be in alphabetical order.

        .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
        """
        # An explicit quote character disables automatic quote selection,
        # unless use_best_quote_char is itself passed explicitly (the loop
        # below then overrides this again).
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False
        # Set for real by serialize(); initialised here so that encode() and
        # encodeStrict() are usable on a freshly constructed instance.
        self.encoding = None

    def encode(self, string):
        """Encode *string* with the current output encoding, if any.

        Unencodable characters are handled by the error handler named by the
        module-level ``unicode_encode_errors``.  When no encoding is set the
        text is returned unchanged.
        """
        assert(isinstance(string, text_type))
        if self.encoding:
            return string.encode(self.encoding, unicode_encode_errors)
        else:
            return string

    def encodeStrict(self, string):
        """Encode *string* with the current output encoding, if any.

        Uses the "strict" error handler, so unencodable characters raise
        instead of being replaced.  When no encoding is set the text is
        returned unchanged.
        """
        assert(isinstance(string, text_type))
        if self.encoding:
            return string.encode(self.encoding, "strict")
        else:
            return string

    def serialize(self, treewalker, encoding=None):
        """Generate the serialized form of *treewalker* piece by piece.

        treewalker -- iterable of token dicts, each with a "type" key.
        encoding   -- output encoding; when given, byte strings are yielded,
                      otherwise text is yielded.

        Problems found while serializing are reported via serializeError(),
        which appends to ``self.errors`` (reset at the start of each call).
        """
        self.encoding = encoding
        in_cdata = False
        self.errors = []

        if encoding and self.inject_meta_charset:
            from ..filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of the latter filter
        if self.strip_whitespace:
            from ..filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from ..filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from ..filters.optionaltags import Filter
            treewalker = Filter(treewalker)
        # Alphabetical attributes must be last, as other filters
        # could add attributes and alter the order
        if self.alphabetical_attributes:
            from ..filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)

        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    # Pick whichever quote does not occur in the identifier.
                    if '"' in token["systemId"]:
                        if "'" in token["systemId"]:
                            self.serializeError("System identifer contains both single and double quote characters")
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    # Whitespace and the contents of rcdata elements are
                    # written out without escaping.
                    if in_cdata and "</" in token["data"]:
                        self.serializeError("Unexpected </ in CDATA")
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                for (attr_namespace, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(' ')

                    yield self.encodeStrict(k)
                    # A boolean attribute is written as the bare name when
                    # minimization is enabled; everything else gets a value.
                    if not self.minimize_boolean_attributes or \
                        (k not in booleanAttributes.get(name, tuple())
                         and k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict("=")
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            # Quoting is required as soon as the value holds
                            # whitespace or one of > " ' =
                            quote_attr = any(
                                char in v
                                for char in spaceCharacters + ">\"'=")
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                # Prefer the quote that needs no escaping.
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                yield self.encodeStrict("</%s>" % name)

            elif token_type == "Comment":
                data = token["data"]
                if "--" in data:
                    self.serializeError("Comment contains --")
                yield self.encodeStrict("<!--%s-->" % data)

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                known = key in entities
                if not known:
                    self.serializeError("Entity %s not recognized" % name)
                # Only resolve entities that are actually in the table; an
                # unrecognized one has been reported above and is written
                # out literally instead of failing with a KeyError.
                if known and self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                # Any other token type: report the token's data as an error.
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serialize *treewalker* and return the result as a single string.

        Returns a byte string when *encoding* is given, text otherwise.
        """
        if encoding:
            return b"".join(self.serialize(treewalker, encoding))
        else:
            return "".join(self.serialize(treewalker))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record a serialization problem.

        The message is appended to ``self.errors``; when ``self.strict`` is
        true a SerializeError carrying the message is raised as well.
        """
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError(data)
313
314
class SerializeError(Exception):
    """Error in serialized tree"""
318