• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1""" Test script for the Unicode implementation.
2
3Written by Bill Tutt.
4Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import ast
11import unittest
12import unicodedata
13
14from test import support
15from http.client import HTTPException
16from test.test_normalization import check_version
17
# _testcapi is an optional helper module.  When it cannot be imported, the
# three limits all fall back to the same value; the condition
# "INT_MAX < PY_SSIZE_T_MAX" used by the skipUnless decorator on
# test_issue16335 is then false, so that test is skipped.
try:
    from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
except ImportError:
    INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1
22
class UnicodeNamesTest(unittest.TestCase):
    """Tests for Unicode character names.

    Covers named-character escapes in string literals as well as
    unicodedata.name() and unicodedata.lookup(), including name aliases,
    named sequences and the error paths of the unicode-escape codec.
    """

    def checkletter(self, name, code):
        """Evaluate the named-character escape for *name* and check it.

        The escape is assembled at run time inside a raw string and parsed
        with ast.literal_eval(), so this script still compiles even if the
        compiler chokes on \\N escapes.

        Asserts that the evaluated string equals *code* and returns it.
        Any exception raised by ast.literal_eval() propagates to the caller
        (the named-sequence tests rely on this and expect SyntaxError).
        """
        res = ast.literal_eval(r'"\N{%s}"' % name)
        self.assertEqual(res, code)
        return res

    def test_general(self):
        # General and case insensitivity test:
        chars = [
            "LATIN CAPITAL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER R",
            "LATIN CAPITAL LETTER E",
            "LATIN SMALL LETTER D",
            "SPACE",
            "LATIN SMALL LETTER f",
            "LATIN CAPITAL LeTtEr o",
            "LATIN SMaLl LETTER x",
            "SPACE",
            "LATIN SMALL LETTER A",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SpAcE",
            "LATIN SMALL LETTER S",
            "LATIN SMALL LETTER H",
            "LATIN small LETTER e",
            "LATIN small LETTER e",
            "LATIN SMALL LETTER P",
            "FULL STOP"
        ]
        string = "The rEd fOx ate the sheep."

        # zip() stops at the shorter input, so make sure every name is
        # paired with exactly one expected character.
        self.assertEqual(len(chars), len(string))

        self.assertEqual(
            "".join([self.checkletter(*args) for args in zip(chars, string)]),
            string
        )

    def test_ascii_letters(self):
        # Round-trip every lowercase ASCII letter through lookup() and
        # name().  The upper bound is inclusive so that "z" is covered too.
        for codepoint in range(ord("a"), ord("z") + 1):
            char = chr(codepoint)
            name = "LATIN SMALL LETTER %s" % char.upper()
            code = unicodedata.lookup(name)
            self.assertEqual(unicodedata.name(code), name)

    def test_hangul_syllables(self):
        # Spot-check Hangul syllable names between U+AC00 and U+D7A3.
        self.checkletter("HANGUL SYLLABLE GA", "\uac00")
        self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
        self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
        self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
        self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
        self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
        self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
        self.checkletter("HANGUL SYLLABLE YI", "\uc758")
        self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
        self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
        self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
        self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
        self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")

        # The code point right after the last one checked above has no name.
        self.assertRaises(ValueError, unicodedata.name, "\ud7a4")

    def test_cjk_unified_ideographs(self):
        # Names of the form "CJK UNIFIED IDEOGRAPH-<hex>" must map to the
        # code point spelled out in the name.
        self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
        self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
        self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")

    def test_bmp_characters(self):
        # Every named character below U+10000 must survive a
        # name() -> lookup() round trip; unnamed code points are skipped.
        for code in range(0x10000):
            char = chr(code)
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)

    def test_misc_symbols(self):
        self.checkletter("PILCROW SIGN", "\u00b6")
        self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")

    def test_aliases(self):
        # Check that the aliases defined in the NameAliases.txt file work.
        # This should be updated when new aliases are added or the file
        # should be downloaded and parsed instead.  See #12753.
        aliases = [
            ('LATIN CAPITAL LETTER GHA', 0x01A2),
            ('LATIN SMALL LETTER GHA', 0x01A3),
            ('KANNADA LETTER LLLA', 0x0CDE),
            ('LAO LETTER FO FON', 0x0E9D),
            ('LAO LETTER FO FAY', 0x0E9F),
            ('LAO LETTER RO', 0x0EA3),
            ('LAO LETTER LO', 0x0EA5),
            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
            ('YI SYLLABLE ITERATION MARK', 0xA015),
            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
        ]
        for alias, codepoint in aliases:
            self.checkletter(alias, chr(codepoint))
            # name() must report a name different from the alias, while
            # lookup() must resolve both to the same character.
            name = unicodedata.name(chr(codepoint))
            self.assertNotEqual(name, alias)
            self.assertEqual(unicodedata.lookup(alias),
                             unicodedata.lookup(name))
            # The 3.2.0 database is expected not to know the alias.
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(alias)

    def test_aliases_names_in_pua_range(self):
        # We are storing aliases in the PUA 15, but their names shouldn't leak
        for cp in range(0xf0000, 0xf0100):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_names_in_pua_range(self):
        # We are storing named seq in the PUA 15, but their names shouldn't leak
        for cp in range(0xf0100, 0xf0fff):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_sample(self):
        # Check a few named sequences.  See #12753.
        sequences = [
            ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
            ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
            ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
            ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
            ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
        ]
        for seqname, codepoints in sequences:
            self.assertEqual(unicodedata.lookup(seqname), codepoints)
            # Named sequences resolve through lookup() only: the \N escape
            # must reject them, and so must the 3.2.0 database.
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)

    def test_named_sequences_full(self):
        # Check all the named sequences
        url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
               unicodedata.unidata_version)
        try:
            testdata = support.open_urlresource(url, encoding="utf-8",
                                                check=check_version)
        except (OSError, HTTPException):
            # The data file is fetched on demand; skip rather than fail
            # when it cannot be retrieved.
            self.skipTest("Could not retrieve " + url)
        self.addCleanup(testdata.close)
        for line in testdata:
            line = line.strip()
            # Ignore blank lines and comment lines.
            if not line or line.startswith('#'):
                continue
            # Each remaining line is "<name>;<space-separated hex code points>".
            seqname, codepoints = line.split(';')
            codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
            self.assertEqual(unicodedata.lookup(seqname), codepoints)
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)

    def test_errors(self):
        # Wrong argument counts/types raise TypeError; an unknown name
        # raises KeyError.
        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, 'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, 'unknown')

    def test_strict_error_handling(self):
        # bogus character name
        self.assertRaises(
            UnicodeError,
            str, b"\\N{blah}", 'unicode-escape', 'strict'
        )
        # long bogus character name
        self.assertRaises(
            UnicodeError,
            str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict'
        )
        # missing closing brace
        self.assertRaises(
            UnicodeError,
            str, b"\\N{SPACE", 'unicode-escape', 'strict'
        )
        # missing opening brace
        self.assertRaises(
            UnicodeError,
            str, b"\\NSPACE", 'unicode-escape', 'strict'
        )

    @support.cpython_only
    @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
    @support.bigmemtest(size=UINT_MAX + 1, memuse=2 + 1, dry_run=False)
    def test_issue16335(self, size):
        # very very long bogus character name
        x = b'\\N{SPACE' + b'x' * (UINT_MAX + 1) + b'}'
        self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
        self.assertRaisesRegex(UnicodeError,
            'unknown Unicode character name',
            x.decode, 'unicode-escape'
        )
236
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
239