""" Test script for the unicodedata module.

    Written by Marc-Andre Lemburg (mal@lemburg.com).

    (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""
8
9import sys
10import unittest
11import hashlib
12from test.support import script_helper
13
# Codec and error handler used by UnicodeMethodsTest to turn each
# character's collected data into bytes before hashing.  The loop there
# walks every code point below 0x10000, which includes lone surrogates
# that strict UTF-8 encoding would reject.
encoding = 'utf-8'
errors = 'surrogatepass'
16
17
18### Run tests
19
20class UnicodeMethodsTest(unittest.TestCase):
21
22    # update this, if the database changes
23    expectedchecksum = '97a41f208c53d5e08c77c1175187e95386b82b6f'
24
25    def test_method_checksum(self):
26        h = hashlib.sha1()
27        for i in range(0x10000):
28            char = chr(i)
29            data = [
30                # Predicates (single char)
31                "01"[char.isalnum()],
32                "01"[char.isalpha()],
33                "01"[char.isdecimal()],
34                "01"[char.isdigit()],
35                "01"[char.islower()],
36                "01"[char.isnumeric()],
37                "01"[char.isspace()],
38                "01"[char.istitle()],
39                "01"[char.isupper()],
40
41                # Predicates (multiple chars)
42                "01"[(char + 'abc').isalnum()],
43                "01"[(char + 'abc').isalpha()],
44                "01"[(char + '123').isdecimal()],
45                "01"[(char + '123').isdigit()],
46                "01"[(char + 'abc').islower()],
47                "01"[(char + '123').isnumeric()],
48                "01"[(char + ' \t').isspace()],
49                "01"[(char + 'abc').istitle()],
50                "01"[(char + 'ABC').isupper()],
51
52                # Mappings (single char)
53                char.lower(),
54                char.upper(),
55                char.title(),
56
57                # Mappings (multiple chars)
58                (char + 'abc').lower(),
59                (char + 'ABC').upper(),
60                (char + 'abc').title(),
61                (char + 'ABC').title(),
62
63                ]
64            h.update(''.join(data).encode(encoding, errors))
65        result = h.hexdigest()
66        self.assertEqual(result, self.expectedchecksum)
67
class UnicodeDatabaseTest(unittest.TestCase):
    """Base fixture that exposes the unicodedata module as ``self.db``."""

    def setUp(self):
        # The import happens per test rather than at module level: when
        # unicodedata is not available this raises an ImportError here,
        # but the other test cases will still be run.
        import unicodedata as database
        self.db = database

    def tearDown(self):
        # Drop the module reference installed by setUp.
        delattr(self, 'db')
78
class UnicodeFunctionsTest(UnicodeDatabaseTest):
    """Tests for the per-character lookup functions of the database.

    All lookups go through ``self.db``, which the base class binds to the
    unicodedata module in ``setUp``.
    """

    # Update this if the database changes. Make sure to do a full rebuild
    # (e.g. 'make distclean && make') to get the correct checksum.
    expectedchecksum = '4f73278b19c2ec3099724c132f0b90a1d25c19e4'

    def test_function_checksum(self):
        # Hash the properties of every code point below 0x10000 and
        # compare the digest with the recorded value.
        h = hashlib.sha1()

        for i in range(0x10000):
            char = chr(i)
            data = [
                # Properties
                format(self.db.digit(char, -1), '.12g'),
                format(self.db.numeric(char, -1), '.12g'),
                format(self.db.decimal(char, -1), '.12g'),
                self.db.category(char),
                self.db.bidirectional(char),
                self.db.decomposition(char),
                str(self.db.mirrored(char)),
                str(self.db.combining(char)),
            ]
            h.update(''.join(data).encode("ascii"))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

    def test_digit(self):
        self.assertIsNone(self.db.digit('A', None))
        self.assertEqual(self.db.digit('9'), 9)
        self.assertIsNone(self.db.digit('\u215b', None))
        self.assertEqual(self.db.digit('\u2468'), 9)
        self.assertIsNone(self.db.digit('\U00020000', None))
        self.assertEqual(self.db.digit('\U0001D7FD'), 7)

        # No argument, a multi-character string, and a character with
        # no digit value and no default supplied.
        self.assertRaises(TypeError, self.db.digit)
        self.assertRaises(TypeError, self.db.digit, 'xx')
        self.assertRaises(ValueError, self.db.digit, 'x')

    def test_numeric(self):
        self.assertIsNone(self.db.numeric('A', None))
        self.assertEqual(self.db.numeric('9'), 9)
        self.assertEqual(self.db.numeric('\u215b'), 0.125)
        self.assertEqual(self.db.numeric('\u2468'), 9.0)
        self.assertEqual(self.db.numeric('\ua627'), 7.0)
        self.assertIsNone(self.db.numeric('\U00020000', None))
        self.assertEqual(self.db.numeric('\U0001012A'), 9000)

        self.assertRaises(TypeError, self.db.numeric)
        self.assertRaises(TypeError, self.db.numeric, 'xx')
        self.assertRaises(ValueError, self.db.numeric, 'x')

    def test_decimal(self):
        self.assertIsNone(self.db.decimal('A', None))
        self.assertEqual(self.db.decimal('9'), 9)
        self.assertIsNone(self.db.decimal('\u215b', None))
        self.assertIsNone(self.db.decimal('\u2468', None))
        self.assertIsNone(self.db.decimal('\U00020000', None))
        self.assertEqual(self.db.decimal('\U0001D7FD'), 7)

        self.assertRaises(TypeError, self.db.decimal)
        self.assertRaises(TypeError, self.db.decimal, 'xx')
        self.assertRaises(ValueError, self.db.decimal, 'x')

    def test_category(self):
        self.assertEqual(self.db.category('\uFFFE'), 'Cn')
        self.assertEqual(self.db.category('a'), 'Ll')
        self.assertEqual(self.db.category('A'), 'Lu')
        self.assertEqual(self.db.category('\U00020000'), 'Lo')
        self.assertEqual(self.db.category('\U0001012A'), 'No')

        self.assertRaises(TypeError, self.db.category)
        self.assertRaises(TypeError, self.db.category, 'xx')

    def test_bidirectional(self):
        self.assertEqual(self.db.bidirectional('\uFFFE'), '')
        self.assertEqual(self.db.bidirectional(' '), 'WS')
        self.assertEqual(self.db.bidirectional('A'), 'L')
        self.assertEqual(self.db.bidirectional('\U00020000'), 'L')

        self.assertRaises(TypeError, self.db.bidirectional)
        self.assertRaises(TypeError, self.db.bidirectional, 'xx')

    def test_decomposition(self):
        self.assertEqual(self.db.decomposition('\uFFFE'), '')
        self.assertEqual(self.db.decomposition('\u00bc'),
                         '<fraction> 0031 2044 0034')

        self.assertRaises(TypeError, self.db.decomposition)
        self.assertRaises(TypeError, self.db.decomposition, 'xx')

    def test_mirrored(self):
        self.assertEqual(self.db.mirrored('\uFFFE'), 0)
        self.assertEqual(self.db.mirrored('a'), 0)
        self.assertEqual(self.db.mirrored('\u2201'), 1)
        self.assertEqual(self.db.mirrored('\U00020000'), 0)

        self.assertRaises(TypeError, self.db.mirrored)
        self.assertRaises(TypeError, self.db.mirrored, 'xx')

    def test_combining(self):
        self.assertEqual(self.db.combining('\uFFFE'), 0)
        self.assertEqual(self.db.combining('a'), 0)
        self.assertEqual(self.db.combining('\u20e1'), 230)
        self.assertEqual(self.db.combining('\U00020000'), 0)

        self.assertRaises(TypeError, self.db.combining)
        self.assertRaises(TypeError, self.db.combining, 'xx')

    def test_normalize(self):
        self.assertRaises(TypeError, self.db.normalize)
        self.assertRaises(ValueError, self.db.normalize, 'unknown', 'xx')
        self.assertEqual(self.db.normalize('NFKC', ''), '')
        # The rest can be found in test_normalization.py
        # which requires an external file.

    def test_pr29(self):
        # http://www.unicode.org/review/pr-29.html
        # See issues #1054943 and #10254.
        composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161",
                    'Li\u030dt-s\u1e73\u0301',
                    '\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
                    + '\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
                    '\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
                    + '\u0938\u094d\u0924\u093e\u0928')
        for text in composed:
            # subTest identifies the failing sample and lets the
            # remaining ones still be checked.
            with self.subTest(text=text):
                self.assertEqual(self.db.normalize('NFC', text), text)

    def test_issue10254(self):
        # Crash reported in #10254
        a = 'C\u0338' * 20 + 'C\u0327'
        b = 'C\u0338' * 20 + '\xC7'
        self.assertEqual(self.db.normalize('NFC', a), b)

    def test_issue29456(self):
        # Fix #29456
        u1176_str_a = '\u1100\u1176\u11a8'
        u1176_str_b = '\u1100\u1176\u11a8'
        u11a7_str_a = '\u1100\u1175\u11a7'
        u11a7_str_b = '\uae30\u11a7'
        u11c3_str_a = '\u1100\u1175\u11c3'
        u11c3_str_b = '\uae30\u11c3'
        self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
        self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
        self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)

    def test_east_asian_width(self):
        eaw = self.db.east_asian_width
        # Only a str of length one is accepted.
        self.assertRaises(TypeError, eaw, b'a')
        self.assertRaises(TypeError, eaw, bytearray())
        self.assertRaises(TypeError, eaw, '')
        self.assertRaises(TypeError, eaw, 'ra')
        self.assertEqual(eaw('\x1e'), 'N')
        self.assertEqual(eaw('\x20'), 'Na')
        self.assertEqual(eaw('\uC894'), 'W')
        self.assertEqual(eaw('\uFF66'), 'H')
        self.assertEqual(eaw('\uFF1F'), 'F')
        self.assertEqual(eaw('\u2010'), 'A')
        self.assertEqual(eaw('\U00020000'), 'W')

    def test_east_asian_width_9_0_changes(self):
        # The same character reports a different width in the 3.2.0
        # snapshot than in the current database.
        self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
        self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
241
class UnicodeMiscTest(UnicodeDatabaseTest):
    """Regression tests for individual bug reports and consistency checks.

    Database lookups go through ``self.db``, which the base class binds
    to the unicodedata module in ``setUp``.
    """

    def test_failed_import_during_compiling(self):
        # Issue 4367
        # Decoding \N escapes requires the unicodedata module. If it can't be
        # imported, we shouldn't segfault.

        # This program should raise a SyntaxError in the eval.
        code = "import sys;" \
            "sys.modules['unicodedata'] = None;" \
            """eval("'\\\\N{SOFT HYPHEN}'")"""
        # We use a separate process because the unicodedata module may already
        # have been loaded in this process.
        result = script_helper.assert_python_failure("-c", code)
        error = "SyntaxError: (unicode error) \\N escapes not supported " \
            "(can't load unicodedata module)"
        self.assertIn(error, result.err.decode("ascii"))

    def test_decimal_numeric_consistent(self):
        # Test that decimal and numeric are consistent,
        # i.e. if a character has a decimal value,
        # its numeric value should be the same.
        count = 0
        for i in range(0x10000):
            c = chr(i)
            dec = self.db.decimal(c, -1)
            if dec != -1:
                self.assertEqual(dec, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_digit_numeric_consistent(self):
        # Test that digit and numeric are consistent,
        # i.e. if a character has a digit value,
        # its numeric value should be the same.
        count = 0
        for i in range(0x10000):
            c = chr(i)
            digit = self.db.digit(c, -1)
            if digit != -1:
                self.assertEqual(digit, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_bug_1704793(self):
        self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346')

    def test_ucd_510(self):
        # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
        self.assertTrue(self.db.mirrored("\u0f3a"))
        self.assertFalse(self.db.ucd_3_2_0.mirrored("\u0f3a"))
        # Also, we now have two ways of representing
        # the upper-case mapping: as delta, or as absolute value
        self.assertEqual("a".upper(), 'A')
        self.assertEqual("\u1d79".upper(), '\ua77d')
        self.assertEqual(".".upper(), '.')

    def test_bug_5828(self):
        self.assertEqual("\u1d79".lower(), "\u1d79")
        # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
        self.assertEqual(
            [
                c for c in range(sys.maxunicode+1)
                if "\x00" in chr(c).lower()+chr(c).upper()+chr(c).title()
            ],
            [0]
        )

    def test_bug_4971(self):
        # LETTER DZ WITH CARON: DZ, Dz, dz
        self.assertEqual("\u01c4".title(), "\u01c5")
        self.assertEqual("\u01c5".title(), "\u01c5")
        self.assertEqual("\u01c6".title(), "\u01c5")

    def test_linebreak_7643(self):
        # Code points below 0x10000 that str.splitlines() is expected to
        # split on; every other one must leave the string in one piece.
        linebreaks = frozenset((0x0a, 0x0b, 0x0c, 0x0d, 0x85,
                                0x1c, 0x1d, 0x1e, 0x2028, 0x2029))
        for i in range(0x10000):
            lines = (chr(i) + 'A').splitlines()
            if i in linebreaks:
                self.assertEqual(len(lines), 2,
                                 r"\u%.4x should be a linebreak" % i)
            else:
                self.assertEqual(len(lines), 1,
                                 r"\u%.4x should not be a linebreak" % i)
327
# Run every test case defined above when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
330