""" Tests for the unicodedata module.

    Written by Marc-Andre Lemburg (mal@lemburg.com).

    (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""
8
import hashlib
from http.client import HTTPException
import sys
import unicodedata
import unittest
from test.support import (open_urlresource, requires_resource, script_helper,
                          cpython_only, check_disallow_instantiation)
16
17
class UnicodeMethodsTest(unittest.TestCase):
    """Checksum test for the str predicate and case-mapping methods."""

    # update this, if the database changes
    expectedchecksum = 'fbdf8106a3c7c242086b0a9efa03ad4d30d5b85d'

    @requires_resource('cpu')
    def test_method_checksum(self):
        # For every code point, collect the results of the str predicates and
        # case mappings, feed them into one SHA-1 digest, and compare the
        # digest with the pinned value.  The order of the entries below is
        # part of the checksum and must not change.
        h = hashlib.sha1()
        for i in range(sys.maxunicode + 1):
            char = chr(i)
            data = [
                # Predicates (single char)
                "01"[char.isalnum()],
                "01"[char.isalpha()],
                "01"[char.isdecimal()],
                "01"[char.isdigit()],
                "01"[char.islower()],
                "01"[char.isnumeric()],
                "01"[char.isspace()],
                "01"[char.istitle()],
                "01"[char.isupper()],

                # Predicates (multiple chars)
                "01"[(char + 'abc').isalnum()],
                "01"[(char + 'abc').isalpha()],
                "01"[(char + '123').isdecimal()],
                "01"[(char + '123').isdigit()],
                "01"[(char + 'abc').islower()],
                "01"[(char + '123').isnumeric()],
                "01"[(char + ' \t').isspace()],
                "01"[(char + 'abc').istitle()],
                "01"[(char + 'ABC').isupper()],

                # Mappings (single char)
                char.lower(),
                char.upper(),
                char.title(),

                # Mappings (multiple chars)
                (char + 'abc').lower(),
                (char + 'ABC').upper(),
                (char + 'abc').title(),
                (char + 'ABC').title(),

                ]
            # 'surrogatepass' lets lone surrogate code points be encoded.
            h.update(''.join(data).encode('utf-8', 'surrogatepass'))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)
66
class UnicodeDatabaseTest(unittest.TestCase):
    """Base class for tests that reach the database through ``self.db``."""

    # Database object under test; subclasses call its functions via self.db.
    db = unicodedata
69
class UnicodeFunctionsTest(UnicodeDatabaseTest):
    """Tests for the per-character lookup functions reached via ``self.db``."""

    # Update this if the database changes. Make sure to do a full rebuild
    # (e.g. 'make distclean && make') to get the correct checksum.
    expectedchecksum = 'd1e37a2854df60ac607b47b51189b9bf1b54bfdb'

    @requires_resource('cpu')
    def test_function_checksum(self):
        # Feed eight properties of every code point into one SHA-1 digest and
        # compare it with the pinned value.  The order of the entries is part
        # of the checksum and must not change.
        h = hashlib.sha1()

        for i in range(sys.maxunicode + 1):
            char = chr(i)
            data = [
                # Properties
                format(self.db.digit(char, -1), '.12g'),
                format(self.db.numeric(char, -1), '.12g'),
                format(self.db.decimal(char, -1), '.12g'),
                self.db.category(char),
                self.db.bidirectional(char),
                self.db.decomposition(char),
                str(self.db.mirrored(char)),
                str(self.db.combining(char)),
            ]
            h.update(''.join(data).encode("ascii"))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

    def test_digit(self):
        self.assertIsNone(self.db.digit('A', None))
        self.assertEqual(self.db.digit('9'), 9)
        self.assertIsNone(self.db.digit('\u215b', None))
        self.assertEqual(self.db.digit('\u2468'), 9)
        self.assertIsNone(self.db.digit('\U00020000', None))
        self.assertEqual(self.db.digit('\U0001D7FD'), 7)

        # No argument / more than one character / no value and no default.
        self.assertRaises(TypeError, self.db.digit)
        self.assertRaises(TypeError, self.db.digit, 'xx')
        self.assertRaises(ValueError, self.db.digit, 'x')

    def test_numeric(self):
        self.assertIsNone(self.db.numeric('A', None))
        self.assertEqual(self.db.numeric('9'), 9)
        self.assertEqual(self.db.numeric('\u215b'), 0.125)
        self.assertEqual(self.db.numeric('\u2468'), 9.0)
        self.assertEqual(self.db.numeric('\ua627'), 7.0)
        self.assertIsNone(self.db.numeric('\U00020000', None))
        self.assertEqual(self.db.numeric('\U0001012A'), 9000)

        self.assertRaises(TypeError, self.db.numeric)
        self.assertRaises(TypeError, self.db.numeric, 'xx')
        self.assertRaises(ValueError, self.db.numeric, 'x')

    def test_decimal(self):
        self.assertIsNone(self.db.decimal('A', None))
        self.assertEqual(self.db.decimal('9'), 9)
        self.assertIsNone(self.db.decimal('\u215b', None))
        self.assertIsNone(self.db.decimal('\u2468', None))
        self.assertIsNone(self.db.decimal('\U00020000', None))
        self.assertEqual(self.db.decimal('\U0001D7FD'), 7)

        self.assertRaises(TypeError, self.db.decimal)
        self.assertRaises(TypeError, self.db.decimal, 'xx')
        self.assertRaises(ValueError, self.db.decimal, 'x')

    def test_category(self):
        self.assertEqual(self.db.category('\uFFFE'), 'Cn')
        self.assertEqual(self.db.category('a'), 'Ll')
        self.assertEqual(self.db.category('A'), 'Lu')
        self.assertEqual(self.db.category('\U00020000'), 'Lo')
        self.assertEqual(self.db.category('\U0001012A'), 'No')

        self.assertRaises(TypeError, self.db.category)
        self.assertRaises(TypeError, self.db.category, 'xx')

    def test_bidirectional(self):
        self.assertEqual(self.db.bidirectional('\uFFFE'), '')
        self.assertEqual(self.db.bidirectional(' '), 'WS')
        self.assertEqual(self.db.bidirectional('A'), 'L')
        self.assertEqual(self.db.bidirectional('\U00020000'), 'L')

        self.assertRaises(TypeError, self.db.bidirectional)
        self.assertRaises(TypeError, self.db.bidirectional, 'xx')

    def test_decomposition(self):
        self.assertEqual(self.db.decomposition('\uFFFE'), '')
        self.assertEqual(self.db.decomposition('\u00bc'),
                         '<fraction> 0031 2044 0034')

        self.assertRaises(TypeError, self.db.decomposition)
        self.assertRaises(TypeError, self.db.decomposition, 'xx')

    def test_mirrored(self):
        self.assertEqual(self.db.mirrored('\uFFFE'), 0)
        self.assertEqual(self.db.mirrored('a'), 0)
        self.assertEqual(self.db.mirrored('\u2201'), 1)
        self.assertEqual(self.db.mirrored('\U00020000'), 0)

        self.assertRaises(TypeError, self.db.mirrored)
        self.assertRaises(TypeError, self.db.mirrored, 'xx')

    def test_combining(self):
        self.assertEqual(self.db.combining('\uFFFE'), 0)
        self.assertEqual(self.db.combining('a'), 0)
        self.assertEqual(self.db.combining('\u20e1'), 230)
        self.assertEqual(self.db.combining('\U00020000'), 0)

        self.assertRaises(TypeError, self.db.combining)
        self.assertRaises(TypeError, self.db.combining, 'xx')

    def test_pr29(self):
        # https://www.unicode.org/review/pr-29.html
        # See issues #1054943 and #10254.
        # Each string below must come back from NFC unchanged.
        composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161",
                    'Li\u030dt-s\u1e73\u0301',
                    '\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
                    + '\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
                    '\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
                    + '\u0938\u094d\u0924\u093e\u0928')
        for text in composed:
            self.assertEqual(self.db.normalize('NFC', text), text)

    def test_issue10254(self):
        # Crash reported in #10254
        a = 'C\u0338' * 20 + 'C\u0327'
        b = 'C\u0338' * 20 + '\xC7'
        self.assertEqual(self.db.normalize('NFC', a), b)

    def test_issue29456(self):
        # Fix #29456
        # For the first pair the expected NFC result is the input itself.
        u1176_str_a = '\u1100\u1176\u11a8'
        u1176_str_b = '\u1100\u1176\u11a8'
        u11a7_str_a = '\u1100\u1175\u11a7'
        u11a7_str_b = '\uae30\u11a7'
        u11c3_str_a = '\u1100\u1175\u11c3'
        u11c3_str_b = '\uae30\u11c3'
        self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
        self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
        self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)

    def test_east_asian_width(self):
        eaw = self.db.east_asian_width
        # Non-str arguments and strings whose length is not 1 are rejected.
        self.assertRaises(TypeError, eaw, b'a')
        self.assertRaises(TypeError, eaw, bytearray())
        self.assertRaises(TypeError, eaw, '')
        self.assertRaises(TypeError, eaw, 'ra')
        self.assertEqual(eaw('\x1e'), 'N')
        self.assertEqual(eaw('\x20'), 'Na')
        self.assertEqual(eaw('\uC894'), 'W')
        self.assertEqual(eaw('\uFF66'), 'H')
        self.assertEqual(eaw('\uFF1F'), 'F')
        self.assertEqual(eaw('\u2010'), 'A')
        self.assertEqual(eaw('\U00020000'), 'W')

    def test_east_asian_width_9_0_changes(self):
        # U+231A is 'N' in the 3.2.0 database but 'W' in the current one.
        self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
        self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
226
class UnicodeMiscTest(UnicodeDatabaseTest):
    """Regression and consistency tests around the unicodedata module."""

    @cpython_only
    def test_disallow_instantiation(self):
        # Ensure that the type disallows instantiation (bpo-43916)
        check_disallow_instantiation(self, unicodedata.UCD)

    def test_failed_import_during_compiling(self):
        # Issue 4367
        # Decoding \N escapes requires the unicodedata module. If it can't be
        # imported, we shouldn't segfault.

        # This program should raise a SyntaxError in the eval.
        code = "import sys;" \
            "sys.modules['unicodedata'] = None;" \
            """eval("'\\\\N{SOFT HYPHEN}'")"""
        # We use a separate process because the unicodedata module may already
        # have been loaded in this process.
        result = script_helper.assert_python_failure("-c", code)
        error = "SyntaxError: (unicode error) \\N escapes not supported " \
            "(can't load unicodedata module)"
        self.assertIn(error, result.err.decode("ascii"))

    def test_decimal_numeric_consistent(self):
        # Test that decimal and numeric are consistent,
        # i.e. if a character has a decimal value,
        # its numeric value should be the same.
        count = 0
        for i in range(0x10000):
            c = chr(i)
            dec = self.db.decimal(c, -1)
            if dec != -1:
                self.assertEqual(dec, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_digit_numeric_consistent(self):
        # Test that digit and numeric are consistent,
        # i.e. if a character has a digit value,
        # its numeric value should be the same.
        count = 0
        for i in range(0x10000):
            c = chr(i)
            digit = self.db.digit(c, -1)
            if digit != -1:
                self.assertEqual(digit, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_bug_1704793(self):
        self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346')

    def test_ucd_510(self):
        # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
        self.assertTrue(unicodedata.mirrored("\u0f3a"))
        self.assertFalse(unicodedata.ucd_3_2_0.mirrored("\u0f3a"))
        # Also, we now have two ways of representing
        # the upper-case mapping: as delta, or as absolute value
        self.assertEqual("a".upper(), 'A')
        self.assertEqual("\u1d79".upper(), '\ua77d')
        self.assertEqual(".".upper(), '.')

    def test_bug_5828(self):
        self.assertEqual("\u1d79".lower(), "\u1d79")
        # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
        self.assertEqual(
            [
                c for c in range(sys.maxunicode+1)
                if "\x00" in chr(c).lower()+chr(c).upper()+chr(c).title()
            ],
            [0]
        )

    def test_bug_4971(self):
        # LETTER DZ WITH CARON: DZ, Dz, dz
        self.assertEqual("\u01c4".title(), "\u01c5")
        self.assertEqual("\u01c5".title(), "\u01c5")
        self.assertEqual("\u01c6".title(), "\u01c5")

    def test_linebreak_7643(self):
        # Code points below 0x10000 that str.splitlines() must split on;
        # built once instead of on every loop iteration.
        linebreaks = frozenset((0x0a, 0x0b, 0x0c, 0x0d, 0x85,
                                0x1c, 0x1d, 0x1e, 0x2028, 0x2029))
        for i in range(0x10000):
            lines = (chr(i) + 'A').splitlines()
            if i in linebreaks:
                self.assertEqual(len(lines), 2,
                                 r"\u%.4x should be a linebreak" % i)
            else:
                self.assertEqual(len(lines), 1,
                                 r"\u%.4x should not be a linebreak" % i)
317
class NormalizationTest(unittest.TestCase):
    """Check unicodedata.normalize() against the NormalizationTest.txt data."""

    @staticmethod
    def check_version(testfile):
        """Return True if the first line of *testfile* names our UCD version."""
        hdr = testfile.readline()
        return unicodedata.unidata_version in hdr

    @staticmethod
    def unistr(data):
        """Turn space-separated hexadecimal code points into a string."""
        return "".join(chr(int(x, 16)) for x in data.split(" "))

    @requires_resource('network')
    def test_normalization(self):
        TESTDATAFILE = "NormalizationTest.txt"
        TESTDATAURL = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{TESTDATAFILE}"

        # Hit the exception early
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=self.check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {TESTDATAURL} "
                          f"into the test data directory")
        except (OSError, HTTPException):
            self.fail(f"Could not retrieve {TESTDATAURL}")

        with testdata:
            self.run_normalization_tests(testdata)

    def run_normalization_tests(self, testdata):
        """Check every data line of *testdata*, then all remaining code points.

        *testdata* is iterated line by line.  Text after '#' is ignored, a
        line starting with "@Part" switches the current part, and any other
        non-empty line must hold five ';'-terminated columns of hexadecimal
        code points (c1..c5).
        """
        part = None
        # Source strings (c1) seen in @Part1; they are skipped by the final
        # loop over all code points.
        part1_data = set()

        def NFC(text):
            return unicodedata.normalize("NFC", text)

        def NFKC(text):
            return unicodedata.normalize("NFKC", text)

        def NFD(text):
            return unicodedata.normalize("NFD", text)

        def NFKD(text):
            return unicodedata.normalize("NFKD", text)

        for line in testdata:
            if '#' in line:
                line = line.split('#')[0]
            line = line.strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            # The field after the last ';' is dropped by the [:-1] slice.
            c1, c2, c3, c4, c5 = [self.unistr(x) for x in line.split(';')[:-1]]

            # Perform tests
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            self.assertTrue(unicodedata.is_normalized("NFC", c2), line)
            self.assertTrue(unicodedata.is_normalized("NFC", c4), line)

            self.assertTrue(unicodedata.is_normalized("NFD", c3), line)
            self.assertTrue(unicodedata.is_normalized("NFD", c5), line)

            self.assertTrue(unicodedata.is_normalized("NFKC", c4), line)
            self.assertTrue(unicodedata.is_normalized("NFKD", c5), line)

            # Record part 1 data
            if part == "@Part1":
                part1_data.add(c1)

        # Perform tests for all other data
        for c in range(sys.maxunicode+1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_edge_cases(self):
        self.assertRaises(TypeError, unicodedata.normalize)
        self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx')
        self.assertEqual(unicodedata.normalize('NFKC', ''), '')

    def test_bug_834676(self):
        # Check for bug 834676
        unicodedata.normalize('NFC', '\ud55c\uae00')
414
415
# Run the tests in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()
418