• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1""" Test script for the Unicode implementation.
2
3Written by Bill Tutt.
4Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import ast
11import unittest
12import unicodedata
13
14from test import support
15from http.client import HTTPException
16
17try:
18    from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
19except ImportError:
20    INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1
21
class UnicodeNamesTest(unittest.TestCase):
    """Exercise Unicode character-name handling.

    Covers named-character escapes in string literals and in the
    'unicode-escape' codec, as well as unicodedata.name() and
    unicodedata.lookup(), including name aliases and named sequences.
    """

    def checkletter(self, name, code):
        """Evaluate the named-character escape for *name*; compare to *code*.

        Returns the string produced by the escape.  Parsing is done by
        ast.literal_eval() at run time, so a name the parser rejects shows
        up as an exception raised from this call, not as a compile error
        in this file.
        """
        # Helper that puts all \N escapes inside raw strings evaluated at
        # run time, to make sure this script runs even if the compiler
        # chokes on \N escapes
        res = ast.literal_eval(r'"\N{%s}"' % name)
        self.assertEqual(res, code)
        return res

    def _check_named_sequence(self, seqname, codepoints):
        # Shared by the named-sequence tests: the name must resolve through
        # unicodedata.lookup(), must be rejected as a \N{...} escape, and
        # must be unknown to the 3.2.0 database.
        self.assertEqual(unicodedata.lookup(seqname), codepoints)
        with self.assertRaises(SyntaxError):
            self.checkletter(seqname, None)
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(seqname)

    def test_general(self):
        # General and case insensitivity test: the names below are paired,
        # position by position, with the characters of `string`.
        chars = [
            "LATIN CAPITAL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER R",
            "LATIN CAPITAL LETTER E",
            "LATIN SMALL LETTER D",
            "SPACE",
            "LATIN SMALL LETTER f",
            "LATIN CAPITAL LeTtEr o",
            "LATIN SMaLl LETTER x",
            "SPACE",
            "LATIN SMALL LETTER A",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SpAcE",
            "LATIN SMALL LETTER S",
            "LATIN SMALL LETTER H",
            "LATIN small LETTER e",
            "LATIN small LETTER e",
            "LATIN SMALL LETTER P",
            "FULL STOP"
        ]
        string = "The rEd fOx ate the sheep."

        # checkletter() verifies each character individually; the join
        # then verifies that the whole sentence was reproduced.
        self.assertEqual(
            "".join([self.checkletter(*args) for args in zip(chars, string)]),
            string
        )

    def test_ascii_letters(self):
        # Round-trip every lowercase ASCII letter through lookup() and
        # name().  The range end is exclusive, hence the "+ 1" so that "z"
        # is covered as well.
        for char in map(chr, range(ord("a"), ord("z") + 1)):
            name = "LATIN SMALL LETTER %s" % char.upper()
            code = unicodedata.lookup(name)
            self.assertEqual(unicodedata.name(code), name)

    def test_hangul_syllables(self):
        self.checkletter("HANGUL SYLLABLE GA", "\uac00")
        self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
        self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
        self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
        self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
        self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
        self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
        self.checkletter("HANGUL SYLLABLE YI", "\uc758")
        self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
        self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
        self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
        self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
        self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")

        # The code point right after the last syllable checked above must
        # have no name.
        self.assertRaises(ValueError, unicodedata.name, "\ud7a4")

    def test_cjk_unified_ideographs(self):
        self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
        self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
        self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
        self.checkletter("CJK UNIFIED IDEOGRAPH-3134A", "\U0003134A")

    def test_bmp_characters(self):
        # Every named character below U+10000 must map back to itself
        # through lookup(); unnamed code points are skipped.
        for code in range(0x10000):
            char = chr(code)
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)

    def test_misc_symbols(self):
        self.checkletter("PILCROW SIGN", "\u00b6")
        self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")

    def test_aliases(self):
        # Check that the aliases defined in the NameAliases.txt file work.
        # This should be updated when new aliases are added or the file
        # should be downloaded and parsed instead.  See #12753.
        aliases = [
            ('LATIN CAPITAL LETTER GHA', 0x01A2),
            ('LATIN SMALL LETTER GHA', 0x01A3),
            ('KANNADA LETTER LLLA', 0x0CDE),
            ('LAO LETTER FO FON', 0x0E9D),
            ('LAO LETTER FO FAY', 0x0E9F),
            ('LAO LETTER RO', 0x0EA3),
            ('LAO LETTER LO', 0x0EA5),
            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
            ('YI SYLLABLE ITERATION MARK', 0xA015),
            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
        ]
        for alias, codepoint in aliases:
            self.checkletter(alias, chr(codepoint))
            # name() must report a name other than the alias, yet both
            # spellings must resolve to the same character.
            name = unicodedata.name(chr(codepoint))
            self.assertNotEqual(name, alias)
            self.assertEqual(unicodedata.lookup(alias),
                             unicodedata.lookup(name))
            # The 3.2.0 database must not know the alias.
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(alias)

    def test_aliases_names_in_pua_range(self):
        # We are storing aliases in the PUA 15, but their names shouldn't leak
        for cp in range(0xf0000, 0xf0100):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_names_in_pua_range(self):
        # We are storing named seq in the PUA 15, but their names shouldn't leak
        # The range end is exclusive: 0xf1000 makes U+F0FFF part of the check.
        for cp in range(0xf0100, 0xf1000):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_sample(self):
        # Check a few named sequences.  See #12753.
        sequences = [
            ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
            ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
            ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
            ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
            ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
        ]
        for seqname, codepoints in sequences:
            self._check_named_sequence(seqname, codepoints)

    def test_named_sequences_full(self):
        # Check all the named sequences
        def check_version(testfile):
            # The first line of the data file must mention the Unicode
            # version this interpreter's database was built from.
            hdr = testfile.readline()
            return unicodedata.unidata_version in hdr
        url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
               unicodedata.unidata_version)
        try:
            testdata = support.open_urlresource(url, encoding="utf-8",
                                                check=check_version)
        except (OSError, HTTPException):
            self.skipTest("Could not retrieve " + url)
        self.addCleanup(testdata.close)
        for line in testdata:
            line = line.strip()
            # Skip blank lines and comment lines.
            if not line or line.startswith('#'):
                continue
            # Remaining lines are "<name>;<hex code point> <hex code point> ...".
            seqname, codepoints = line.split(';')
            codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
            self._check_named_sequence(seqname, codepoints)

    def test_errors(self):
        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, 'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, 'unknown')

    def test_strict_error_handling(self):
        # bogus character name
        self.assertRaises(
            UnicodeError,
            str, b"\\N{blah}", 'unicode-escape', 'strict'
        )
        # long bogus character name
        self.assertRaises(
            UnicodeError,
            str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict'
        )
        # missing closing brace
        self.assertRaises(
            UnicodeError,
            str, b"\\N{SPACE", 'unicode-escape', 'strict'
        )
        # missing opening brace
        self.assertRaises(
            UnicodeError,
            str, b"\\NSPACE", 'unicode-escape', 'strict'
        )

    @support.cpython_only
    @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
    @support.bigmemtest(size=UINT_MAX + 1, memuse=2 + 1, dry_run=False)
    def test_issue16335(self, size):
        # very very long bogus character name
        x = b'\\N{SPACE' + b'x' * (UINT_MAX + 1) + b'}'
        # Sanity check: the buffer really has the intended length.
        self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
        self.assertRaisesRegex(UnicodeError,
            'unknown Unicode character name',
            x.decode, 'unicode-escape'
        )
238
239
# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
242