• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1from __future__ import absolute_import, division, unicode_literals
2
3import os
4import unittest
5
# Compatibility shim: when unittest.TestCase has no ``assertEqual``
# attribute, alias it to ``assertEquals`` so the tests below can always
# call ``assertEqual``.
if not hasattr(unittest.TestCase, "assertEqual"):
    unittest.TestCase.assertEqual = unittest.TestCase.assertEquals
10
11from .support import get_data_files, TestData, test_dir, errorMessage
12from html5lib import HTMLParser, inputstream
13
14
class Html5EncodingTestCase(unittest.TestCase):
    # Each test feeds one encoding label to inputstream.codecName and
    # compares the result with the name the test expects back.

    def test_codec_name_a(self):
        # "utf-8" is expected to come back unchanged.
        expected = "utf-8"
        self.assertEqual(inputstream.codecName("utf-8"), expected)

    def test_codec_name_b(self):
        # The hyphen-less spelling is expected to map to "utf-8".
        expected = "utf-8"
        self.assertEqual(inputstream.codecName("utf8"), expected)

    def test_codec_name_c(self):
        # Surrounding whitespace in the label is expected to be ignored.
        expected = "utf-8"
        self.assertEqual(inputstream.codecName("  utf8  "), expected)

    def test_codec_name_d(self):
        # This oddly punctuated ISO-8859-1 label is expected to resolve
        # to "windows-1252".
        expected = "windows-1252"
        self.assertEqual(inputstream.codecName("ISO_8859--1"), expected)
27
28
def runParserEncodingTest(data, encoding):
    # Parse ``data`` with a fresh HTMLParser (chardet disabled) and check
    # that the parser's documentEncoding equals ``encoding``.
    #
    # ``encoding`` must support ``.lower().decode("ascii")``; it is
    # lower-cased and decoded before the comparison.
    parser = HTMLParser()

    # A parser that has not parsed anything should report no encoding yet.
    assert parser.documentEncoding is None

    parser.parse(data, useChardet=False)
    expected = encoding.lower().decode("ascii")

    assert expected == parser.documentEncoding, errorMessage(
        data, expected, parser.documentEncoding)
36
37
def runPreScanEncodingTest(data, encoding):
    # Build an HTMLBinaryInputStream over ``data`` (chardet disabled) and
    # check that the first element of its charEncoding equals ``encoding``
    # once that has been lower-cased and decoded as ASCII.
    binary_stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
    expected = encoding.lower().decode("ascii")

    # Very crude way to ignore irrelevant tests: only inputs no longer
    # than the stream's numBytesMeta are actually checked.
    if len(data) <= binary_stream.numBytesMeta:
        assert expected == binary_stream.charEncoding[0], errorMessage(
            data, expected, binary_stream.charEncoding[0])
47
48
def test_encoding():
    # Generator test: for every test case in every "encoding" data file,
    # yield two (callable, data, encoding) tuples -- first the full-parser
    # check, then the pre-scan check. Presumably consumed by a test runner
    # that supports yield-style tests (TODO confirm which runner).
    for filename in get_data_files("encoding"):
        # The test fields are looked up with bytes keys below.
        for test in TestData(filename, b"data", encoding=None):
            data = test[b'data']
            encoding = test[b'encoding']
            yield (runParserEncodingTest, data, encoding)
            yield (runPreScanEncodingTest, data, encoding)
55
# The chardet-based test is only defined when a detection library can be
# imported: charade is tried first, with chardet as the fallback. If neither
# is importable a notice is printed and no test is defined.
try:
    try:
        import charade  # flake8: noqa
    except ImportError:
        import chardet  # flake8: noqa
except ImportError:
    print("charade/chardet not found, skipping chardet tests")
else:
    def test_chardet():
        # Read the Big5 sample file as bytes and check that the encoding
        # reported by HTMLInputStream is "big5" (compared case-insensitively).
        sample_path = os.path.join(test_dir, "encoding", "chardet", "test_big5.txt")
        with open(sample_path, "rb") as sample_file:
            detected = inputstream.HTMLInputStream(sample_file.read()).charEncoding
            name = detected[0]
            assert name.lower() == "big5"
68