• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1""" Test script for the Unicode implementation.
2
3Written by Bill Tutt.
4Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import unittest
11import unicodedata
12
13from test import support
14from http.client import HTTPException
15from test.test_normalization import check_version
16
17try:
18    from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
19except ImportError:
20    INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1
21
22class UnicodeNamesTest(unittest.TestCase):
23
24    def checkletter(self, name, code):
25        # Helper that put all \N escapes inside eval'd raw strings,
26        # to make sure this script runs even if the compiler
27        # chokes on \N escapes
28        res = eval(r'"\N{%s}"' % name)
29        self.assertEqual(res, code)
30        return res
31
32    def test_general(self):
33        # General and case insensitivity test:
34        chars = [
35            "LATIN CAPITAL LETTER T",
36            "LATIN SMALL LETTER H",
37            "LATIN SMALL LETTER E",
38            "SPACE",
39            "LATIN SMALL LETTER R",
40            "LATIN CAPITAL LETTER E",
41            "LATIN SMALL LETTER D",
42            "SPACE",
43            "LATIN SMALL LETTER f",
44            "LATIN CAPITAL LeTtEr o",
45            "LATIN SMaLl LETTER x",
46            "SPACE",
47            "LATIN SMALL LETTER A",
48            "LATIN SMALL LETTER T",
49            "LATIN SMALL LETTER E",
50            "SPACE",
51            "LATIN SMALL LETTER T",
52            "LATIN SMALL LETTER H",
53            "LATIN SMALL LETTER E",
54            "SpAcE",
55            "LATIN SMALL LETTER S",
56            "LATIN SMALL LETTER H",
57            "LATIN small LETTER e",
58            "LATIN small LETTER e",
59            "LATIN SMALL LETTER P",
60            "FULL STOP"
61        ]
62        string = "The rEd fOx ate the sheep."
63
64        self.assertEqual(
65            "".join([self.checkletter(*args) for args in zip(chars, string)]),
66            string
67        )
68
69    def test_ascii_letters(self):
70        for char in "".join(map(chr, range(ord("a"), ord("z")))):
71            name = "LATIN SMALL LETTER %s" % char.upper()
72            code = unicodedata.lookup(name)
73            self.assertEqual(unicodedata.name(code), name)
74
75    def test_hangul_syllables(self):
76        self.checkletter("HANGUL SYLLABLE GA", "\uac00")
77        self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
78        self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
79        self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
80        self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
81        self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
82        self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
83        self.checkletter("HANGUL SYLLABLE YI", "\uc758")
84        self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
85        self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
86        self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
87        self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
88        self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")
89
90        self.assertRaises(ValueError, unicodedata.name, "\ud7a4")
91
92    def test_cjk_unified_ideographs(self):
93        self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
94        self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
95        self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
96        self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
97        self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
98        self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
99        self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
100        self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
101        self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
102        self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
103
104    def test_bmp_characters(self):
105        for code in range(0x10000):
106            char = chr(code)
107            name = unicodedata.name(char, None)
108            if name is not None:
109                self.assertEqual(unicodedata.lookup(name), char)
110
111    def test_misc_symbols(self):
112        self.checkletter("PILCROW SIGN", "\u00b6")
113        self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
114        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
115        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")
116
117    def test_aliases(self):
118        # Check that the aliases defined in the NameAliases.txt file work.
119        # This should be updated when new aliases are added or the file
120        # should be downloaded and parsed instead.  See #12753.
121        aliases = [
122            ('LATIN CAPITAL LETTER GHA', 0x01A2),
123            ('LATIN SMALL LETTER GHA', 0x01A3),
124            ('KANNADA LETTER LLLA', 0x0CDE),
125            ('LAO LETTER FO FON', 0x0E9D),
126            ('LAO LETTER FO FAY', 0x0E9F),
127            ('LAO LETTER RO', 0x0EA3),
128            ('LAO LETTER LO', 0x0EA5),
129            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
130            ('YI SYLLABLE ITERATION MARK', 0xA015),
131            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
132            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
133        ]
134        for alias, codepoint in aliases:
135            self.checkletter(alias, chr(codepoint))
136            name = unicodedata.name(chr(codepoint))
137            self.assertNotEqual(name, alias)
138            self.assertEqual(unicodedata.lookup(alias),
139                             unicodedata.lookup(name))
140            with self.assertRaises(KeyError):
141                unicodedata.ucd_3_2_0.lookup(alias)
142
143    def test_aliases_names_in_pua_range(self):
144        # We are storing aliases in the PUA 15, but their names shouldn't leak
145        for cp in range(0xf0000, 0xf0100):
146            with self.assertRaises(ValueError) as cm:
147                unicodedata.name(chr(cp))
148            self.assertEqual(str(cm.exception), 'no such name')
149
150    def test_named_sequences_names_in_pua_range(self):
151        # We are storing named seq in the PUA 15, but their names shouldn't leak
152        for cp in range(0xf0100, 0xf0fff):
153            with self.assertRaises(ValueError) as cm:
154                unicodedata.name(chr(cp))
155            self.assertEqual(str(cm.exception), 'no such name')
156
157    def test_named_sequences_sample(self):
158        # Check a few named sequences.  See #12753.
159        sequences = [
160            ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
161            ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
162            ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
163            ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
164            ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
165        ]
166        for seqname, codepoints in sequences:
167            self.assertEqual(unicodedata.lookup(seqname), codepoints)
168            with self.assertRaises(SyntaxError):
169                self.checkletter(seqname, None)
170            with self.assertRaises(KeyError):
171                unicodedata.ucd_3_2_0.lookup(seqname)
172
173    def test_named_sequences_full(self):
174        # Check all the named sequences
175        url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
176               unicodedata.unidata_version)
177        try:
178            testdata = support.open_urlresource(url, encoding="utf-8",
179                                                check=check_version)
180        except (OSError, HTTPException):
181            self.skipTest("Could not retrieve " + url)
182        self.addCleanup(testdata.close)
183        for line in testdata:
184            line = line.strip()
185            if not line or line.startswith('#'):
186                continue
187            seqname, codepoints = line.split(';')
188            codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
189            self.assertEqual(unicodedata.lookup(seqname), codepoints)
190            with self.assertRaises(SyntaxError):
191                self.checkletter(seqname, None)
192            with self.assertRaises(KeyError):
193                unicodedata.ucd_3_2_0.lookup(seqname)
194
195    def test_errors(self):
196        self.assertRaises(TypeError, unicodedata.name)
197        self.assertRaises(TypeError, unicodedata.name, 'xx')
198        self.assertRaises(TypeError, unicodedata.lookup)
199        self.assertRaises(KeyError, unicodedata.lookup, 'unknown')
200
201    def test_strict_error_handling(self):
202        # bogus character name
203        self.assertRaises(
204            UnicodeError,
205            str, b"\\N{blah}", 'unicode-escape', 'strict'
206        )
207        # long bogus character name
208        self.assertRaises(
209            UnicodeError,
210            str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict'
211        )
212        # missing closing brace
213        self.assertRaises(
214            UnicodeError,
215            str, b"\\N{SPACE", 'unicode-escape', 'strict'
216        )
217        # missing opening brace
218        self.assertRaises(
219            UnicodeError,
220            str, b"\\NSPACE", 'unicode-escape', 'strict'
221        )
222
223    @support.cpython_only
224    @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
225    @support.bigmemtest(size=UINT_MAX + 1, memuse=2 + 1, dry_run=False)
226    def test_issue16335(self, size):
227        # very very long bogus character name
228        x = b'\\N{SPACE' + b'x' * (UINT_MAX + 1) + b'}'
229        self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
230        self.assertRaisesRegex(UnicodeError,
231            'unknown Unicode character name',
232            x.decode, 'unicode-escape'
233        )
234
235
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
238