"""Tests for html5lib character-encoding name lookup and detection."""
from __future__ import absolute_import, division, unicode_literals

import os
import unittest

# Fall back to the older ``assertEquals`` spelling when the running
# unittest.TestCase does not provide ``assertEqual``.
try:
    unittest.TestCase.assertEqual
except AttributeError:
    unittest.TestCase.assertEqual = unittest.TestCase.assertEquals

from .support import get_data_files, TestData, test_dir, errorMessage
from html5lib import HTMLParser, inputstream


class Html5EncodingTestCase(unittest.TestCase):
    """Check the names returned by ``inputstream.codecName`` for sample labels."""

    def test_codec_name_a(self):
        self.assertEqual(inputstream.codecName("utf-8"), "utf-8")

    def test_codec_name_b(self):
        self.assertEqual(inputstream.codecName("utf8"), "utf-8")

    def test_codec_name_c(self):
        self.assertEqual(inputstream.codecName("  utf8  "), "utf-8")

    def test_codec_name_d(self):
        self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")


def runParserEncodingTest(data, encoding):
    """Parse ``data`` and assert the parser reports the expected encoding.

    ``data`` is handed to ``HTMLParser.parse`` with chardet disabled.
    ``encoding`` is the expected label; it is lower-cased and decoded from
    ASCII before comparison, so it must support ``.lower().decode("ascii")``.
    """
    p = HTMLParser()
    # A fresh parser must not claim an encoding before it has parsed anything.
    assert p.documentEncoding is None
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode("ascii")

    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)


def runPreScanEncodingTest(data, encoding):
    """Assert that ``HTMLBinaryInputStream`` detects the expected encoding.

    ``data`` is wrapped in an ``HTMLBinaryInputStream`` with chardet disabled.
    ``encoding`` is the expected label; it is lower-cased and decoded from
    ASCII before comparison. Inputs longer than ``stream.numBytesMeta`` are
    skipped without asserting anything.
    """
    stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
    encoding = encoding.lower().decode("ascii")

    # Very crude way to ignore irrelevant tests
    if len(data) > stream.numBytesMeta:
        return

    assert encoding == stream.charEncoding[0], errorMessage(data, encoding, stream.charEncoding[0])


def test_encoding():
    """Yield (checker, data, encoding) tuples for every encoding test case.

    For each file returned by ``get_data_files("encoding")``, every test
    entry produces one parser check and one pre-scan check.
    """
    for filename in get_data_files("encoding"):
        tests = TestData(filename, b"data", encoding=None)
        for test in tests:
            yield (runParserEncodingTest, test[b'data'], test[b'encoding'])
            yield (runPreScanEncodingTest, test[b'data'], test[b'encoding'])


# The chardet-based test is only defined when charade or chardet is importable.
try:
    try:
        import charade  # flake8: noqa
    except ImportError:
        import chardet  # flake8: noqa
except ImportError:
    print("charade/chardet not found, skipping chardet tests")
else:
    def test_chardet():
        """Assert that the Big5 sample file is detected as ``big5``."""
        with open(os.path.join(test_dir, "encoding", "chardet", "test_big5.txt"), "rb") as fp:
            encoding = inputstream.HTMLInputStream(fp.read()).charEncoding
            assert encoding[0].lower() == "big5"