""" Test script for the unicodedata module.

    Written by Marc-Andre Lemburg (mal@lemburg.com).

    (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""
8
import hashlib
import sys
import unittest
from test.support import script_helper
13
14encoding = 'utf-8'
15errors = 'surrogatepass'
16
17
18### Run tests
19
20class UnicodeMethodsTest(unittest.TestCase):
21
22    # update this, if the database changes
23    expectedchecksum = 'c1fa98674a683aa8a8d8dee0c84494f8d36346e6'
24
25    def test_method_checksum(self):
26        h = hashlib.sha1()
27        for i in range(0x10000):
28            char = chr(i)
29            data = [
30                # Predicates (single char)
31                "01"[char.isalnum()],
32                "01"[char.isalpha()],
33                "01"[char.isdecimal()],
34                "01"[char.isdigit()],
35                "01"[char.islower()],
36                "01"[char.isnumeric()],
37                "01"[char.isspace()],
38                "01"[char.istitle()],
39                "01"[char.isupper()],
40
41                # Predicates (multiple chars)
42                "01"[(char + 'abc').isalnum()],
43                "01"[(char + 'abc').isalpha()],
44                "01"[(char + '123').isdecimal()],
45                "01"[(char + '123').isdigit()],
46                "01"[(char + 'abc').islower()],
47                "01"[(char + '123').isnumeric()],
48                "01"[(char + ' \t').isspace()],
49                "01"[(char + 'abc').istitle()],
50                "01"[(char + 'ABC').isupper()],
51
52                # Mappings (single char)
53                char.lower(),
54                char.upper(),
55                char.title(),
56
57                # Mappings (multiple chars)
58                (char + 'abc').lower(),
59                (char + 'ABC').upper(),
60                (char + 'abc').title(),
61                (char + 'ABC').title(),
62
63                ]
64            h.update(''.join(data).encode(encoding, errors))
65        result = h.hexdigest()
66        self.assertEqual(result, self.expectedchecksum)
67
class UnicodeDatabaseTest(unittest.TestCase):
    """Base class for tests that use the unicodedata module.

    setUp() imports the module and stores it as ``self.db``; tearDown()
    removes that attribute again.  The class defines no tests itself.
    """

    def setUp(self):
        # In case unicodedata is not available, this will raise an ImportError,
        # but the other test cases will still be run
        import unicodedata
        self.db = unicodedata

    def tearDown(self):
        del self.db
78
class UnicodeFunctionsTest(UnicodeDatabaseTest):
    """Tests for the lookup functions of the unicodedata module.

    ``self.db`` is the unicodedata module, provided by
    UnicodeDatabaseTest.setUp().
    """

    # Update this if the database changes. Make sure to do a full rebuild
    # (e.g. 'make distclean && make') to get the correct checksum.
    expectedchecksum = 'f891b1e6430c712531b9bc935a38e22d78ba1bf3'

    def test_function_checksum(self):
        """Hash the database properties of U+0000..U+FFFF and compare.

        The per-character property strings are joined, encoded as ASCII and
        fed into a SHA-1 hash whose hex digest must equal
        ``expectedchecksum``.
        """
        h = hashlib.sha1()

        for i in range(0x10000):
            char = chr(i)
            data = [
                # Properties; -1 is the default returned for characters
                # that have no digit/numeric/decimal value.
                format(self.db.digit(char, -1), '.12g'),
                format(self.db.numeric(char, -1), '.12g'),
                format(self.db.decimal(char, -1), '.12g'),
                self.db.category(char),
                self.db.bidirectional(char),
                self.db.decomposition(char),
                str(self.db.mirrored(char)),
                str(self.db.combining(char)),
            ]
            h.update(''.join(data).encode("ascii"))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

    def test_digit(self):
        """Check digit() values, default handling and argument errors."""
        self.assertIsNone(self.db.digit('A', None))
        self.assertEqual(self.db.digit('9'), 9)
        self.assertIsNone(self.db.digit('\u215b', None))
        self.assertEqual(self.db.digit('\u2468'), 9)
        self.assertIsNone(self.db.digit('\U00020000', None))
        self.assertEqual(self.db.digit('\U0001D7FD'), 7)

        self.assertRaises(TypeError, self.db.digit)
        self.assertRaises(TypeError, self.db.digit, 'xx')
        self.assertRaises(ValueError, self.db.digit, 'x')

    def test_numeric(self):
        """Check numeric() values, default handling and argument errors."""
        self.assertIsNone(self.db.numeric('A', None))
        self.assertEqual(self.db.numeric('9'), 9)
        self.assertEqual(self.db.numeric('\u215b'), 0.125)
        self.assertEqual(self.db.numeric('\u2468'), 9.0)
        self.assertEqual(self.db.numeric('\ua627'), 7.0)
        self.assertIsNone(self.db.numeric('\U00020000', None))
        self.assertEqual(self.db.numeric('\U0001012A'), 9000)

        self.assertRaises(TypeError, self.db.numeric)
        self.assertRaises(TypeError, self.db.numeric, 'xx')
        self.assertRaises(ValueError, self.db.numeric, 'x')

    def test_decimal(self):
        """Check decimal() values, default handling and argument errors."""
        self.assertIsNone(self.db.decimal('A', None))
        self.assertEqual(self.db.decimal('9'), 9)
        self.assertIsNone(self.db.decimal('\u215b', None))
        self.assertIsNone(self.db.decimal('\u2468', None))
        self.assertIsNone(self.db.decimal('\U00020000', None))
        self.assertEqual(self.db.decimal('\U0001D7FD'), 7)

        self.assertRaises(TypeError, self.db.decimal)
        self.assertRaises(TypeError, self.db.decimal, 'xx')
        self.assertRaises(ValueError, self.db.decimal, 'x')

    def test_category(self):
        """Check category() for sample characters and argument errors."""
        self.assertEqual(self.db.category('\uFFFE'), 'Cn')
        self.assertEqual(self.db.category('a'), 'Ll')
        self.assertEqual(self.db.category('A'), 'Lu')
        self.assertEqual(self.db.category('\U00020000'), 'Lo')
        self.assertEqual(self.db.category('\U0001012A'), 'No')

        self.assertRaises(TypeError, self.db.category)
        self.assertRaises(TypeError, self.db.category, 'xx')

    def test_bidirectional(self):
        """Check bidirectional() for sample characters and argument errors."""
        self.assertEqual(self.db.bidirectional('\uFFFE'), '')
        self.assertEqual(self.db.bidirectional(' '), 'WS')
        self.assertEqual(self.db.bidirectional('A'), 'L')
        self.assertEqual(self.db.bidirectional('\U00020000'), 'L')

        self.assertRaises(TypeError, self.db.bidirectional)
        self.assertRaises(TypeError, self.db.bidirectional, 'xx')

    def test_decomposition(self):
        """Check decomposition() for sample characters and argument errors."""
        self.assertEqual(self.db.decomposition('\uFFFE'), '')
        self.assertEqual(self.db.decomposition('\u00bc'),
                         '<fraction> 0031 2044 0034')

        self.assertRaises(TypeError, self.db.decomposition)
        self.assertRaises(TypeError, self.db.decomposition, 'xx')

    def test_mirrored(self):
        """Check mirrored() for sample characters and argument errors."""
        self.assertEqual(self.db.mirrored('\uFFFE'), 0)
        self.assertEqual(self.db.mirrored('a'), 0)
        self.assertEqual(self.db.mirrored('\u2201'), 1)
        self.assertEqual(self.db.mirrored('\U00020000'), 0)

        self.assertRaises(TypeError, self.db.mirrored)
        self.assertRaises(TypeError, self.db.mirrored, 'xx')

    def test_combining(self):
        """Check combining() for sample characters and argument errors."""
        self.assertEqual(self.db.combining('\uFFFE'), 0)
        self.assertEqual(self.db.combining('a'), 0)
        self.assertEqual(self.db.combining('\u20e1'), 230)
        self.assertEqual(self.db.combining('\U00020000'), 0)

        self.assertRaises(TypeError, self.db.combining)
        self.assertRaises(TypeError, self.db.combining, 'xx')

    def test_normalize(self):
        """Check normalize() argument validation and the empty string."""
        self.assertRaises(TypeError, self.db.normalize)
        self.assertRaises(ValueError, self.db.normalize, 'unknown', 'xx')
        self.assertEqual(self.db.normalize('NFKC', ''), '')
        # The rest can be found in test_normalization.py
        # which requires an external file.

    def test_pr29(self):
        """Check that the sample strings are left unchanged by NFC."""
        # http://www.unicode.org/review/pr-29.html
        # See issues #1054943 and #10254.
        composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161",
                    'Li\u030dt-s\u1e73\u0301',
                    '\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
                    + '\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
                    '\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
                    + '\u0938\u094d\u0924\u093e\u0928')
        for text in composed:
            self.assertEqual(self.db.normalize('NFC', text), text)

    def test_issue10254(self):
        """Check NFC composition after a long run of combining marks."""
        # Crash reported in #10254
        a = 'C\u0338' * 20 + 'C\u0327'
        b = 'C\u0338' * 20 + '\xC7'
        self.assertEqual(self.db.normalize('NFC', a), b)

    def test_east_asian_width(self):
        """Check east_asian_width() argument errors and sample values."""
        eaw = self.db.east_asian_width
        self.assertRaises(TypeError, eaw, b'a')
        self.assertRaises(TypeError, eaw, bytearray())
        self.assertRaises(TypeError, eaw, '')
        self.assertRaises(TypeError, eaw, 'ra')
        self.assertEqual(eaw('\x1e'), 'N')
        self.assertEqual(eaw('\x20'), 'Na')
        self.assertEqual(eaw('\uC894'), 'W')
        self.assertEqual(eaw('\uFF66'), 'H')
        self.assertEqual(eaw('\uFF1F'), 'F')
        self.assertEqual(eaw('\u2010'), 'A')
        self.assertEqual(eaw('\U00020000'), 'W')

    def test_east_asian_width_9_0_changes(self):
        """Check that U+231A is 'N' in ucd_3_2_0 but 'W' in the current data."""
        self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
        self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
228
class UnicodeMiscTest(UnicodeDatabaseTest):
    """Regression tests for assorted unicodedata-related bug reports.

    ``self.db`` is the unicodedata module, provided by
    UnicodeDatabaseTest.setUp().
    """

    def test_failed_import_during_compiling(self):
        """Check that a \\N escape fails cleanly when unicodedata is missing."""
        # Issue 4367
        # Decoding \N escapes requires the unicodedata module. If it can't be
        # imported, we shouldn't segfault.

        # This program should raise a SyntaxError in the eval.
        code = "import sys;" \
            "sys.modules['unicodedata'] = None;" \
            """eval("'\\\\N{SOFT HYPHEN}'")"""
        # We use a separate process because the unicodedata module may already
        # have been loaded in this process.
        result = script_helper.assert_python_failure("-c", code)
        error = "SyntaxError: (unicode error) \\N escapes not supported " \
            "(can't load unicodedata module)"
        self.assertIn(error, result.err.decode("ascii"))

    def _check_consistent_with_numeric(self, lookup):
        """Assert that numeric() agrees with *lookup* wherever it has a value.

        *lookup* is called as ``lookup(char, -1)`` for every code point
        below 0x10000; whenever it returns something other than the -1
        default, ``self.db.numeric(char)`` must return an equal value.
        """
        count = 0
        for i in range(0x10000):
            c = chr(i)
            value = lookup(c, -1)
            if value != -1:
                self.assertEqual(value, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_decimal_numeric_consistent(self):
        """Check that decimal() and numeric() agree."""
        # Test that decimal and numeric are consistent,
        # i.e. if a character has a decimal value,
        # its numeric value should be the same.
        self._check_consistent_with_numeric(self.db.decimal)

    def test_digit_numeric_consistent(self):
        """Check that digit() and numeric() agree."""
        # Test that digit and numeric are consistent,
        # i.e. if a character has a digit value,
        # its numeric value should be the same.
        self._check_consistent_with_numeric(self.db.digit)

    def test_bug_1704793(self):
        """Check lookup() of a character name outside the BMP."""
        self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346')

    def test_ucd_510(self):
        """Check a mirrored-property change and sample upper-case mappings."""
        # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
        self.assertTrue(self.db.mirrored("\u0f3a"))
        self.assertFalse(self.db.ucd_3_2_0.mirrored("\u0f3a"))
        # Also, we now have two ways of representing
        # the upper-case mapping: as delta, or as absolute value
        self.assertEqual("a".upper(), 'A')
        self.assertEqual("\u1d79".upper(), '\ua77d')
        self.assertEqual(".".upper(), '.')

    def test_bug_5828(self):
        """Check that only U+0000 maps to U+0000 under case conversion."""
        self.assertEqual("\u1d79".lower(), "\u1d79")
        # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
        self.assertEqual(
            [
                c for c in range(sys.maxunicode+1)
                if "\x00" in chr(c).lower()+chr(c).upper()+chr(c).title()
            ],
            [0]
        )

    def test_bug_4971(self):
        """Check title() for the three case forms of DZ WITH CARON."""
        # LETTER DZ WITH CARON: DZ, Dz, dz
        self.assertEqual("\u01c4".title(), "\u01c5")
        self.assertEqual("\u01c5".title(), "\u01c5")
        self.assertEqual("\u01c6".title(), "\u01c5")

    def test_linebreak_7643(self):
        """Check which code points below 0x10000 splitlines() breaks on."""
        linebreaks = (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
                      0x1c, 0x1d, 0x1e, 0x2028, 0x2029)
        for i in range(0x10000):
            # A line break character in front of 'A' yields two lines
            # ('' and 'A'); any other character yields a single line.
            lines = (chr(i) + 'A').splitlines()
            if i in linebreaks:
                self.assertEqual(len(lines), 2,
                                 r"\u%.4x should be a linebreak" % i)
            else:
                self.assertEqual(len(lines), 1,
                                 r"\u%.4x should not be a linebreak" % i)
314
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
317