• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1""" Test script for the unicodedata module.
2
3    Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5    (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
7"""
8
9import sys
10import unittest
11import hashlib
12from test.support import script_helper
13
# Codec and error handler used by UnicodeMethodsTest when it encodes the
# collected method results before hashing them.  The checksum loop walks
# range(0x10000), which includes the lone surrogate code points; strict
# UTF-8 encoding rejects those, so 'surrogatepass' is used to let them
# through.
encoding = 'utf-8'
errors = 'surrogatepass'
16
17
18### Run tests
19
20class UnicodeMethodsTest(unittest.TestCase):
21
22    # update this, if the database changes
23    expectedchecksum = '9129d6f2bdf008a81c2476e5b5127014a62130c1'
24
25    def test_method_checksum(self):
26        h = hashlib.sha1()
27        for i in range(0x10000):
28            char = chr(i)
29            data = [
30                # Predicates (single char)
31                "01"[char.isalnum()],
32                "01"[char.isalpha()],
33                "01"[char.isdecimal()],
34                "01"[char.isdigit()],
35                "01"[char.islower()],
36                "01"[char.isnumeric()],
37                "01"[char.isspace()],
38                "01"[char.istitle()],
39                "01"[char.isupper()],
40
41                # Predicates (multiple chars)
42                "01"[(char + 'abc').isalnum()],
43                "01"[(char + 'abc').isalpha()],
44                "01"[(char + '123').isdecimal()],
45                "01"[(char + '123').isdigit()],
46                "01"[(char + 'abc').islower()],
47                "01"[(char + '123').isnumeric()],
48                "01"[(char + ' \t').isspace()],
49                "01"[(char + 'abc').istitle()],
50                "01"[(char + 'ABC').isupper()],
51
52                # Mappings (single char)
53                char.lower(),
54                char.upper(),
55                char.title(),
56
57                # Mappings (multiple chars)
58                (char + 'abc').lower(),
59                (char + 'ABC').upper(),
60                (char + 'abc').title(),
61                (char + 'ABC').title(),
62
63                ]
64            h.update(''.join(data).encode(encoding, errors))
65        result = h.hexdigest()
66        self.assertEqual(result, self.expectedchecksum)
67
class UnicodeDatabaseTest(unittest.TestCase):
    """Base fixture that exposes the unicodedata module as ``self.db``."""

    def setUp(self):
        # The import happens per test on purpose: when unicodedata is not
        # available it raises ImportError here, but the other test cases
        # will still be run.
        import unicodedata as database
        self.db = database

    def tearDown(self):
        # Drop the reference taken in setUp.
        delattr(self, 'db')
78
class UnicodeFunctionsTest(UnicodeDatabaseTest):
    """Exercise the lookup functions of the database held in ``self.db``."""

    # Update this if the database changes. Make sure to do a full rebuild
    # (e.g. 'make distclean && make') to get the correct checksum.
    expectedchecksum = 'c44a49ca7c5cb6441640fe174ede604b45028652'

    def test_function_checksum(self):
        # Hash the property values of every code point below 0x10000 and
        # compare the digest with the recorded one.
        h = hashlib.sha1()

        for i in range(0x10000):
            char = chr(i)
            data = [
                # Properties
                format(self.db.digit(char, -1), '.12g'),
                format(self.db.numeric(char, -1), '.12g'),
                format(self.db.decimal(char, -1), '.12g'),
                self.db.category(char),
                self.db.bidirectional(char),
                self.db.decomposition(char),
                str(self.db.mirrored(char)),
                str(self.db.combining(char)),
            ]
            h.update(''.join(data).encode("ascii"))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

    def test_digit(self):
        self.assertIsNone(self.db.digit('A', None))
        self.assertEqual(self.db.digit('9'), 9)
        self.assertIsNone(self.db.digit('\u215b', None))
        self.assertEqual(self.db.digit('\u2468'), 9)
        self.assertIsNone(self.db.digit('\U00020000', None))
        self.assertEqual(self.db.digit('\U0001D7FD'), 7)

        # No argument / more than one character / no value and no default.
        self.assertRaises(TypeError, self.db.digit)
        self.assertRaises(TypeError, self.db.digit, 'xx')
        self.assertRaises(ValueError, self.db.digit, 'x')

    def test_numeric(self):
        self.assertIsNone(self.db.numeric('A', None))
        self.assertEqual(self.db.numeric('9'), 9)
        self.assertEqual(self.db.numeric('\u215b'), 0.125)
        self.assertEqual(self.db.numeric('\u2468'), 9.0)
        self.assertEqual(self.db.numeric('\ua627'), 7.0)
        self.assertIsNone(self.db.numeric('\U00020000', None))
        self.assertEqual(self.db.numeric('\U0001012A'), 9000)

        # No argument / more than one character / no value and no default.
        self.assertRaises(TypeError, self.db.numeric)
        self.assertRaises(TypeError, self.db.numeric, 'xx')
        self.assertRaises(ValueError, self.db.numeric, 'x')

    def test_decimal(self):
        self.assertIsNone(self.db.decimal('A', None))
        self.assertEqual(self.db.decimal('9'), 9)
        self.assertIsNone(self.db.decimal('\u215b', None))
        self.assertIsNone(self.db.decimal('\u2468', None))
        self.assertIsNone(self.db.decimal('\U00020000', None))
        self.assertEqual(self.db.decimal('\U0001D7FD'), 7)

        # No argument / more than one character / no value and no default.
        self.assertRaises(TypeError, self.db.decimal)
        self.assertRaises(TypeError, self.db.decimal, 'xx')
        self.assertRaises(ValueError, self.db.decimal, 'x')

    def test_category(self):
        self.assertEqual(self.db.category('\uFFFE'), 'Cn')
        self.assertEqual(self.db.category('a'), 'Ll')
        self.assertEqual(self.db.category('A'), 'Lu')
        self.assertEqual(self.db.category('\U00020000'), 'Lo')
        self.assertEqual(self.db.category('\U0001012A'), 'No')

        self.assertRaises(TypeError, self.db.category)
        self.assertRaises(TypeError, self.db.category, 'xx')

    def test_bidirectional(self):
        self.assertEqual(self.db.bidirectional('\uFFFE'), '')
        self.assertEqual(self.db.bidirectional(' '), 'WS')
        self.assertEqual(self.db.bidirectional('A'), 'L')
        self.assertEqual(self.db.bidirectional('\U00020000'), 'L')

        self.assertRaises(TypeError, self.db.bidirectional)
        self.assertRaises(TypeError, self.db.bidirectional, 'xx')

    def test_decomposition(self):
        self.assertEqual(self.db.decomposition('\uFFFE'), '')
        self.assertEqual(self.db.decomposition('\u00bc'),
                         '<fraction> 0031 2044 0034')

        self.assertRaises(TypeError, self.db.decomposition)
        self.assertRaises(TypeError, self.db.decomposition, 'xx')

    def test_mirrored(self):
        self.assertEqual(self.db.mirrored('\uFFFE'), 0)
        self.assertEqual(self.db.mirrored('a'), 0)
        self.assertEqual(self.db.mirrored('\u2201'), 1)
        self.assertEqual(self.db.mirrored('\U00020000'), 0)

        self.assertRaises(TypeError, self.db.mirrored)
        self.assertRaises(TypeError, self.db.mirrored, 'xx')

    def test_combining(self):
        self.assertEqual(self.db.combining('\uFFFE'), 0)
        self.assertEqual(self.db.combining('a'), 0)
        self.assertEqual(self.db.combining('\u20e1'), 230)
        self.assertEqual(self.db.combining('\U00020000'), 0)

        self.assertRaises(TypeError, self.db.combining)
        self.assertRaises(TypeError, self.db.combining, 'xx')

    def test_normalize(self):
        self.assertRaises(TypeError, self.db.normalize)
        self.assertRaises(ValueError, self.db.normalize, 'unknown', 'xx')
        self.assertEqual(self.db.normalize('NFKC', ''), '')
        # The rest can be found in test_normalization.py
        # which requires an external file.

    def test_pr29(self):
        # http://www.unicode.org/review/pr-29.html
        # See issues #1054943 and #10254.
        composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161",
                    'Li\u030dt-s\u1e73\u0301',
                    '\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
                    + '\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
                    '\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
                    + '\u0938\u094d\u0924\u093e\u0928')
        for text in composed:
            # subTest reports which sample failed and keeps checking the
            # remaining ones.
            with self.subTest(text=text):
                self.assertEqual(self.db.normalize('NFC', text), text)

    def test_issue10254(self):
        # Crash reported in #10254
        a = 'C\u0338' * 20 + 'C\u0327'
        b = 'C\u0338' * 20 + '\xC7'
        self.assertEqual(self.db.normalize('NFC', a), b)

    def test_issue29456(self):
        # Fix #29456
        # The U+1176 sequence is expected to come back unchanged, while in
        # the other two only the leading pair is composed (into U+AE30) and
        # the final code point is kept as is.
        u1176_str_a = '\u1100\u1176\u11a8'
        u1176_str_b = '\u1100\u1176\u11a8'
        u11a7_str_a = '\u1100\u1175\u11a7'
        u11a7_str_b = '\uae30\u11a7'
        u11c3_str_a = '\u1100\u1175\u11c3'
        u11c3_str_b = '\uae30\u11c3'
        self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
        self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
        self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)

    # For tests of unicodedata.is_normalized / self.db.is_normalized ,
    # see test_normalization.py .

    def test_east_asian_width(self):
        eaw = self.db.east_asian_width
        # Only a str of exactly one character is accepted.
        self.assertRaises(TypeError, eaw, b'a')
        self.assertRaises(TypeError, eaw, bytearray())
        self.assertRaises(TypeError, eaw, '')
        self.assertRaises(TypeError, eaw, 'ra')
        self.assertEqual(eaw('\x1e'), 'N')
        self.assertEqual(eaw('\x20'), 'Na')
        self.assertEqual(eaw('\uC894'), 'W')
        self.assertEqual(eaw('\uFF66'), 'H')
        self.assertEqual(eaw('\uFF1F'), 'F')
        self.assertEqual(eaw('\u2010'), 'A')
        self.assertEqual(eaw('\U00020000'), 'W')

    def test_east_asian_width_9_0_changes(self):
        # U+231A is 'N' in the 3.2.0 database but 'W' in the current one.
        self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
        self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
243
class UnicodeMiscTest(UnicodeDatabaseTest):
    """Regression and consistency checks around the Unicode database."""

    def test_failed_import_during_compiling(self):
        # Issue 4367
        # Decoding \N escapes requires the unicodedata module. If it can't be
        # imported, we shouldn't segfault.

        # This program should raise a SyntaxError in the eval.
        code = "import sys;" \
            "sys.modules['unicodedata'] = None;" \
            """eval("'\\\\N{SOFT HYPHEN}'")"""
        # We use a separate process because the unicodedata module may already
        # have been loaded in this process.
        result = script_helper.assert_python_failure("-c", code)
        error = "SyntaxError: (unicode error) \\N escapes not supported " \
            "(can't load unicodedata module)"
        self.assertIn(error, result.err.decode("ascii"))

    def test_decimal_numeric_consistent(self):
        # Test that decimal and numeric are consistent,
        # i.e. if a character has a decimal value,
        # its numeric value should be the same.
        count = 0
        for i in range(0x10000):
            c = chr(i)
            dec = self.db.decimal(c, -1)
            if dec != -1:
                self.assertEqual(dec, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_digit_numeric_consistent(self):
        # Test that digit and numeric are consistent,
        # i.e. if a character has a digit value,
        # its numeric value should be the same.
        count = 0
        for i in range(0x10000):
            c = chr(i)
            digit = self.db.digit(c, -1)
            if digit != -1:
                self.assertEqual(digit, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_bug_1704793(self):
        self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346')

    def test_ucd_510(self):
        # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
        self.assertTrue(self.db.mirrored("\u0f3a"))
        self.assertFalse(self.db.ucd_3_2_0.mirrored("\u0f3a"))
        # Also, we now have two ways of representing
        # the upper-case mapping: as delta, or as absolute value
        self.assertEqual("a".upper(), 'A')
        self.assertEqual("\u1d79".upper(), '\ua77d')
        self.assertEqual(".".upper(), '.')

    def test_bug_5828(self):
        self.assertEqual("\u1d79".lower(), "\u1d79")
        # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
        self.assertEqual(
            [
                c for c in range(sys.maxunicode+1)
                if "\x00" in chr(c).lower()+chr(c).upper()+chr(c).title()
            ],
            [0]
        )

    def test_bug_4971(self):
        # LETTER DZ WITH CARON: DZ, Dz, dz
        self.assertEqual("\u01c4".title(), "\u01c5")
        self.assertEqual("\u01c5".title(), "\u01c5")
        self.assertEqual("\u01c6".title(), "\u01c5")

    def test_linebreak_7643(self):
        # Code points that str.splitlines() is expected to break on.
        linebreaks = {0x0a, 0x0b, 0x0c, 0x0d, 0x85,
                      0x1c, 0x1d, 0x1e, 0x2028, 0x2029}
        for i in range(0x10000):
            lines = (chr(i) + 'A').splitlines()
            if i in linebreaks:
                self.assertEqual(len(lines), 2,
                                 r"\u%.4x should be a linebreak" % i)
            else:
                self.assertEqual(len(lines), 1,
                                 r"\u%.4x should not be a linebreak" % i)
329
# Run every test case in this file when it is executed directly.
if __name__ == "__main__":
    unittest.main()
332