• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1from __future__ import absolute_import, division, unicode_literals
2
3import json
4import unittest
5
6from .support import get_data_files
7
# Make sure ``unittest.TestCase`` exposes ``assertEqual``; when the attribute
# is missing, alias it to the ``assertEquals`` spelling.
if not hasattr(unittest.TestCase, "assertEqual"):
    unittest.TestCase.assertEqual = unittest.TestCase.assertEquals
12
13import html5lib
14from html5lib import constants
15from html5lib.serializer import HTMLSerializer, serialize
16from html5lib.treewalkers._base import TreeWalker
17
# Names of the optional libraries that could be imported in this environment.
optionals_loaded = []

try:
    from lxml import etree
except ImportError:
    # lxml is optional; its absence simply leaves "lxml" out of the list.
    pass
else:
    optionals_loaded.append("lxml")
25
26default_namespace = constants.namespaces["html"]
27
28
class JsonWalker(TreeWalker):
    """Tree walker that replays the list-based tokens of the serializer tests.

    ``self.tree`` is iterated as a sequence of lists whose first item names
    the token type ("StartTag", "EndTag", "EmptyTag", "Comment",
    "Characters", "SpaceCharacters" or "Doctype"). Each list is turned into
    the corresponding ``self.startTag`` / ``self.endTag`` / ``self.emptyTag``
    / ``self.comment`` / ``self.text`` / ``self.doctype`` call.
    """

    def __iter__(self):
        for entry in self.tree:
            kind = entry[0]

            if kind == "StartTag":
                # Four items means an explicit namespace precedes the name;
                # otherwise the HTML namespace is used.
                if len(entry) == 4:
                    namespace, name, attrib = entry[1:4]
                else:
                    namespace = default_namespace
                    name, attrib = entry[1:3]
                yield self.startTag(namespace, name, self._convertAttrib(attrib))

            elif kind == "EndTag":
                # Three items means an explicit namespace precedes the name.
                if len(entry) == 3:
                    namespace, name = entry[1:3]
                else:
                    namespace = default_namespace
                    name = entry[1]
                yield self.endTag(namespace, name)

            elif kind == "EmptyTag":
                if len(entry) == 4:
                    namespace, name, attrib = entry[1:]
                else:
                    namespace = default_namespace
                    name, attrib = entry[1:]
                # emptyTag is iterated, so every item it produces is passed on.
                for event in self.emptyTag(namespace, name, self._convertAttrib(attrib)):
                    yield event

            elif kind == "Comment":
                yield self.comment(entry[1])

            elif kind == "Characters" or kind == "SpaceCharacters":
                for event in self.text(entry[1]):
                    yield event

            elif kind == "Doctype":
                # Pass along only as many identifiers as the token supplies.
                if len(entry) == 4:
                    yield self.doctype(entry[1], entry[2], entry[3])
                elif len(entry) == 3:
                    yield self.doctype(entry[1], entry[2])
                else:
                    yield self.doctype(entry[1])

            else:
                raise ValueError("Unknown token type: " + kind)

    def _convertAttrib(self, attribs):
        """Convert serializer-test attributes to the tree-walker format.

        The tests describe attributes as a list of dicts with "namespace",
        "name" and "value" keys (JSON cannot express tuple keys); html5lib
        tree walkers expect a dict mapping (namespace, name) to value.
        Duplicate (namespace, name) pairs trip an assertion.
        """
        converted = {}
        for item in attribs:
            key = (item["namespace"], item["name"])
            assert key not in converted
            converted[key] = item["value"]
        return converted
81
82
def serialize_html(input, options):
    """Render a list-based token stream with ``HTMLSerializer``.

    ``input`` is wrapped in a ``JsonWalker``. The keys of ``options`` are
    coerced with ``str`` so the mapping can be expanded into keyword
    arguments for the serializer, which is always created with
    ``alphabetical_attributes=True``. The "encoding" option (``None`` when
    absent) is additionally passed to ``render``.
    """
    keyword_options = dict((str(key), value) for key, value in options.items())
    walker = JsonWalker(input)
    renderer = HTMLSerializer(alphabetical_attributes=True, **keyword_options)
    encoding = keyword_options.get("encoding", None)
    return renderer.render(walker, encoding)
88
89
def runSerializerTest(input, expected, options):
    """Serialize ``input`` and assert the result is an accepted output.

    ``expected`` is a list of acceptable serializations. When ``options``
    carries a truthy "encoding", every expected string is encoded with it
    before the comparison. A single expected value is compared directly;
    with any other number, the result only has to be one of them.
    """
    encoding = options.get("encoding", None)
    if encoding:
        expected = [text.encode(encoding) for text in expected]

    result = serialize_html(input, options)

    if len(expected) == 1:
        only = expected[0]
        assert only == result, "Expected:\n%s\nActual:\n%s\nOptions:\n%s" % (only, result, str(options))
        return

    assert result in expected, "Expected: %s, Received: %s" % (expected, result)
102
103
class EncodingTestCase(unittest.TestCase):
    """Serialization to ISO-8859-1 of tokens containing U+0101.

    Each test places the character in a different part of a token and either
    expects ``UnicodeEncodeError`` or an ``&amacr;`` entity in the output.
    """

    def throwsWithLatin1(self, input):
        # Serializing ``input`` as ISO-8859-1 must raise UnicodeEncodeError.
        latin1 = {"encoding": "iso-8859-1"}
        self.assertRaises(UnicodeEncodeError, serialize_html, input, latin1)

    def testDoctypeName(self):
        tokens = [["Doctype", "\u0101"]]
        self.throwsWithLatin1(tokens)

    def testDoctypePublicId(self):
        tokens = [["Doctype", "potato", "\u0101"]]
        self.throwsWithLatin1(tokens)

    def testDoctypeSystemId(self):
        tokens = [["Doctype", "potato", "potato", "\u0101"]]
        self.throwsWithLatin1(tokens)

    def testCdataCharacters(self):
        # Text following a <style> start tag is expected as an entity.
        tokens = [
            ["StartTag", "http://www.w3.org/1999/xhtml", "style", {}],
            ["Characters", "\u0101"],
        ]
        runSerializerTest(tokens, ["<style>&amacr;"], {"encoding": "iso-8859-1"})

    def testCharacters(self):
        tokens = [["Characters", "\u0101"]]
        runSerializerTest(tokens, ["&amacr;"], {"encoding": "iso-8859-1"})

    def testStartTagName(self):
        tokens = [["StartTag", "http://www.w3.org/1999/xhtml", "\u0101", []]]
        self.throwsWithLatin1(tokens)

    def testEmptyTagName(self):
        tokens = [["EmptyTag", "http://www.w3.org/1999/xhtml", "\u0101", []]]
        self.throwsWithLatin1(tokens)

    def testAttributeName(self):
        attribute = {"namespace": None, "name": "\u0101", "value": "potato"}
        tokens = [["StartTag", "http://www.w3.org/1999/xhtml", "span", [attribute]]]
        self.throwsWithLatin1(tokens)

    def testAttributeValue(self):
        # An attribute value is expected as an entity rather than an error.
        attribute = {"namespace": None, "name": "potato", "value": "\u0101"}
        tokens = [["StartTag", "http://www.w3.org/1999/xhtml", "span", [attribute]]]
        runSerializerTest(tokens, ["<span potato=&amacr;>"], {"encoding": "iso-8859-1"})

    def testEndTagName(self):
        tokens = [["EndTag", "http://www.w3.org/1999/xhtml", "\u0101"]]
        self.throwsWithLatin1(tokens)

    def testComment(self):
        tokens = [["Comment", "\u0101"]]
        self.throwsWithLatin1(tokens)
144
145
if "lxml" in optionals_loaded:
    class LxmlTestCase(unittest.TestCase):
        """Entity handling when serializing trees parsed by lxml.

        Only defined when lxml could be imported. Documents are parsed with
        an ``XMLParser`` created with ``resolve_entities=False``.
        """

        def setUp(self):
            self.parser = etree.XMLParser(resolve_entities=False)
            self.treewalker = html5lib.getTreeWalker("lxml")
            self.serializer = HTMLSerializer()

        def _parseAndSerialize(self, doc, **serializer_options):
            # Parse ``doc`` with the fixture parser and serialize the whole
            # tree through the "lxml" tree walker, keeping optional tags.
            root_tree = etree.fromstring(doc, parser=self.parser).getroottree()
            return serialize(root_tree, tree="lxml", omit_optional_tags=False,
                             **serializer_options)

        def testEntityReplacement(self):
            source = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
            wanted = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>'
            result = self._parseAndSerialize(source)
            self.assertEqual(wanted, result)

        def testEntityXML(self):
            source = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>'
            wanted = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>'
            result = self._parseAndSerialize(source)
            self.assertEqual(wanted, result)

        def testEntityNoResolve(self):
            source = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
            wanted = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
            result = self._parseAndSerialize(source, resolve_entities=False)
            self.assertEqual(wanted, result)
171
172
def test_serializer():
    """Yield one serializer test case per entry in the ``*.test`` data files.

    This is a generator-style test: each yielded tuple is
    ``(runSerializerTest, input, expected, options)``, where ``options``
    falls back to an empty dict when the test entry does not define any.
    """
    for filename in get_data_files('serializer', '*.test'):
        # Read raw bytes and decode explicitly so parsing does not depend on
        # the platform's default text encoding (assumes the JSON test data is
        # UTF-8, the standard JSON interchange encoding).
        with open(filename, "rb") as fp:
            tests = json.loads(fp.read().decode("utf-8"))
        # The file is already closed here, so no handle stays open while the
        # generator is suspended between yields.
        for test in tests['tests']:
            yield runSerializerTest, test["input"], test["expected"], test.get("options", {})
179