1""" Tests for the unicodedata module. 2 3 Written by Marc-Andre Lemburg (mal@lemburg.com). 4 5 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 6 7""" 8 9import hashlib 10from http.client import HTTPException 11import sys 12import unicodedata 13import unittest 14from test.support import (open_urlresource, requires_resource, script_helper, 15 cpython_only, check_disallow_instantiation) 16 17 18class UnicodeMethodsTest(unittest.TestCase): 19 20 # update this, if the database changes 21 expectedchecksum = 'fbdf8106a3c7c242086b0a9efa03ad4d30d5b85d' 22 23 @requires_resource('cpu') 24 def test_method_checksum(self): 25 h = hashlib.sha1() 26 for i in range(sys.maxunicode + 1): 27 char = chr(i) 28 data = [ 29 # Predicates (single char) 30 "01"[char.isalnum()], 31 "01"[char.isalpha()], 32 "01"[char.isdecimal()], 33 "01"[char.isdigit()], 34 "01"[char.islower()], 35 "01"[char.isnumeric()], 36 "01"[char.isspace()], 37 "01"[char.istitle()], 38 "01"[char.isupper()], 39 40 # Predicates (multiple chars) 41 "01"[(char + 'abc').isalnum()], 42 "01"[(char + 'abc').isalpha()], 43 "01"[(char + '123').isdecimal()], 44 "01"[(char + '123').isdigit()], 45 "01"[(char + 'abc').islower()], 46 "01"[(char + '123').isnumeric()], 47 "01"[(char + ' \t').isspace()], 48 "01"[(char + 'abc').istitle()], 49 "01"[(char + 'ABC').isupper()], 50 51 # Mappings (single char) 52 char.lower(), 53 char.upper(), 54 char.title(), 55 56 # Mappings (multiple chars) 57 (char + 'abc').lower(), 58 (char + 'ABC').upper(), 59 (char + 'abc').title(), 60 (char + 'ABC').title(), 61 62 ] 63 h.update(''.join(data).encode('utf-8', 'surrogatepass')) 64 result = h.hexdigest() 65 self.assertEqual(result, self.expectedchecksum) 66 67class UnicodeDatabaseTest(unittest.TestCase): 68 db = unicodedata 69 70class UnicodeFunctionsTest(UnicodeDatabaseTest): 71 72 # Update this if the database changes. Make sure to do a full rebuild 73 # (e.g. 'make distclean && make') to get the correct checksum. 74 expectedchecksum = 'd1e37a2854df60ac607b47b51189b9bf1b54bfdb' 75 76 @requires_resource('cpu') 77 def test_function_checksum(self): 78 data = [] 79 h = hashlib.sha1() 80 81 for i in range(sys.maxunicode + 1): 82 char = chr(i) 83 data = [ 84 # Properties 85 format(self.db.digit(char, -1), '.12g'), 86 format(self.db.numeric(char, -1), '.12g'), 87 format(self.db.decimal(char, -1), '.12g'), 88 self.db.category(char), 89 self.db.bidirectional(char), 90 self.db.decomposition(char), 91 str(self.db.mirrored(char)), 92 str(self.db.combining(char)), 93 ] 94 h.update(''.join(data).encode("ascii")) 95 result = h.hexdigest() 96 self.assertEqual(result, self.expectedchecksum) 97 98 def test_digit(self): 99 self.assertEqual(self.db.digit('A', None), None) 100 self.assertEqual(self.db.digit('9'), 9) 101 self.assertEqual(self.db.digit('\u215b', None), None) 102 self.assertEqual(self.db.digit('\u2468'), 9) 103 self.assertEqual(self.db.digit('\U00020000', None), None) 104 self.assertEqual(self.db.digit('\U0001D7FD'), 7) 105 106 self.assertRaises(TypeError, self.db.digit) 107 self.assertRaises(TypeError, self.db.digit, 'xx') 108 self.assertRaises(ValueError, self.db.digit, 'x') 109 110 def test_numeric(self): 111 self.assertEqual(self.db.numeric('A',None), None) 112 self.assertEqual(self.db.numeric('9'), 9) 113 self.assertEqual(self.db.numeric('\u215b'), 0.125) 114 self.assertEqual(self.db.numeric('\u2468'), 9.0) 115 self.assertEqual(self.db.numeric('\ua627'), 7.0) 116 self.assertEqual(self.db.numeric('\U00020000', None), None) 117 self.assertEqual(self.db.numeric('\U0001012A'), 9000) 118 119 self.assertRaises(TypeError, self.db.numeric) 120 self.assertRaises(TypeError, self.db.numeric, 'xx') 121 self.assertRaises(ValueError, self.db.numeric, 'x') 122 123 def test_decimal(self): 124 self.assertEqual(self.db.decimal('A',None), None) 125 self.assertEqual(self.db.decimal('9'), 9) 126 self.assertEqual(self.db.decimal('\u215b', None), None) 127 self.assertEqual(self.db.decimal('\u2468', None), None) 128 self.assertEqual(self.db.decimal('\U00020000', None), None) 129 self.assertEqual(self.db.decimal('\U0001D7FD'), 7) 130 131 self.assertRaises(TypeError, self.db.decimal) 132 self.assertRaises(TypeError, self.db.decimal, 'xx') 133 self.assertRaises(ValueError, self.db.decimal, 'x') 134 135 def test_category(self): 136 self.assertEqual(self.db.category('\uFFFE'), 'Cn') 137 self.assertEqual(self.db.category('a'), 'Ll') 138 self.assertEqual(self.db.category('A'), 'Lu') 139 self.assertEqual(self.db.category('\U00020000'), 'Lo') 140 self.assertEqual(self.db.category('\U0001012A'), 'No') 141 142 self.assertRaises(TypeError, self.db.category) 143 self.assertRaises(TypeError, self.db.category, 'xx') 144 145 def test_bidirectional(self): 146 self.assertEqual(self.db.bidirectional('\uFFFE'), '') 147 self.assertEqual(self.db.bidirectional(' '), 'WS') 148 self.assertEqual(self.db.bidirectional('A'), 'L') 149 self.assertEqual(self.db.bidirectional('\U00020000'), 'L') 150 151 self.assertRaises(TypeError, self.db.bidirectional) 152 self.assertRaises(TypeError, self.db.bidirectional, 'xx') 153 154 def test_decomposition(self): 155 self.assertEqual(self.db.decomposition('\uFFFE'),'') 156 self.assertEqual(self.db.decomposition('\u00bc'), '<fraction> 0031 2044 0034') 157 158 self.assertRaises(TypeError, self.db.decomposition) 159 self.assertRaises(TypeError, self.db.decomposition, 'xx') 160 161 def test_mirrored(self): 162 self.assertEqual(self.db.mirrored('\uFFFE'), 0) 163 self.assertEqual(self.db.mirrored('a'), 0) 164 self.assertEqual(self.db.mirrored('\u2201'), 1) 165 self.assertEqual(self.db.mirrored('\U00020000'), 0) 166 167 self.assertRaises(TypeError, self.db.mirrored) 168 self.assertRaises(TypeError, self.db.mirrored, 'xx') 169 170 def test_combining(self): 171 self.assertEqual(self.db.combining('\uFFFE'), 0) 172 self.assertEqual(self.db.combining('a'), 0) 173 self.assertEqual(self.db.combining('\u20e1'), 230) 174 self.assertEqual(self.db.combining('\U00020000'), 0) 175 176 self.assertRaises(TypeError, self.db.combining) 177 self.assertRaises(TypeError, self.db.combining, 'xx') 178 179 def test_pr29(self): 180 # https://www.unicode.org/review/pr-29.html 181 # See issues #1054943 and #10254. 182 composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161", 183 'Li\u030dt-s\u1e73\u0301', 184 '\u092e\u093e\u0930\u094d\u0915 \u091c\u093c' 185 + '\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917', 186 '\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c' 187 + '\u0938\u094d\u0924\u093e\u0928') 188 for text in composed: 189 self.assertEqual(self.db.normalize('NFC', text), text) 190 191 def test_issue10254(self): 192 # Crash reported in #10254 193 a = 'C\u0338' * 20 + 'C\u0327' 194 b = 'C\u0338' * 20 + '\xC7' 195 self.assertEqual(self.db.normalize('NFC', a), b) 196 197 def test_issue29456(self): 198 # Fix #29456 199 u1176_str_a = '\u1100\u1176\u11a8' 200 u1176_str_b = '\u1100\u1176\u11a8' 201 u11a7_str_a = '\u1100\u1175\u11a7' 202 u11a7_str_b = '\uae30\u11a7' 203 u11c3_str_a = '\u1100\u1175\u11c3' 204 u11c3_str_b = '\uae30\u11c3' 205 self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b) 206 self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b) 207 self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b) 208 209 def test_east_asian_width(self): 210 eaw = self.db.east_asian_width 211 self.assertRaises(TypeError, eaw, b'a') 212 self.assertRaises(TypeError, eaw, bytearray()) 213 self.assertRaises(TypeError, eaw, '') 214 self.assertRaises(TypeError, eaw, 'ra') 215 self.assertEqual(eaw('\x1e'), 'N') 216 self.assertEqual(eaw('\x20'), 'Na') 217 self.assertEqual(eaw('\uC894'), 'W') 218 self.assertEqual(eaw('\uFF66'), 'H') 219 self.assertEqual(eaw('\uFF1F'), 'F') 220 self.assertEqual(eaw('\u2010'), 'A') 221 self.assertEqual(eaw('\U00020000'), 'W') 222 223 def test_east_asian_width_9_0_changes(self): 224 self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N') 225 self.assertEqual(self.db.east_asian_width('\u231a'), 'W') 226 227class UnicodeMiscTest(UnicodeDatabaseTest): 228 229 @cpython_only 230 def test_disallow_instantiation(self): 231 # Ensure that the type disallows instantiation (bpo-43916) 232 check_disallow_instantiation(self, unicodedata.UCD) 233 234 def test_failed_import_during_compiling(self): 235 # Issue 4367 236 # Decoding \N escapes requires the unicodedata module. If it can't be 237 # imported, we shouldn't segfault. 238 239 # This program should raise a SyntaxError in the eval. 240 code = "import sys;" \ 241 "sys.modules['unicodedata'] = None;" \ 242 """eval("'\\\\N{SOFT HYPHEN}'")""" 243 # We use a separate process because the unicodedata module may already 244 # have been loaded in this process. 245 result = script_helper.assert_python_failure("-c", code) 246 error = "SyntaxError: (unicode error) \\N escapes not supported " \ 247 "(can't load unicodedata module)" 248 self.assertIn(error, result.err.decode("ascii")) 249 250 def test_decimal_numeric_consistent(self): 251 # Test that decimal and numeric are consistent, 252 # i.e. if a character has a decimal value, 253 # its numeric value should be the same. 254 count = 0 255 for i in range(0x10000): 256 c = chr(i) 257 dec = self.db.decimal(c, -1) 258 if dec != -1: 259 self.assertEqual(dec, self.db.numeric(c)) 260 count += 1 261 self.assertTrue(count >= 10) # should have tested at least the ASCII digits 262 263 def test_digit_numeric_consistent(self): 264 # Test that digit and numeric are consistent, 265 # i.e. if a character has a digit value, 266 # its numeric value should be the same. 267 count = 0 268 for i in range(0x10000): 269 c = chr(i) 270 dec = self.db.digit(c, -1) 271 if dec != -1: 272 self.assertEqual(dec, self.db.numeric(c)) 273 count += 1 274 self.assertTrue(count >= 10) # should have tested at least the ASCII digits 275 276 def test_bug_1704793(self): 277 self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346') 278 279 def test_ucd_510(self): 280 import unicodedata 281 # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0 282 self.assertTrue(unicodedata.mirrored("\u0f3a")) 283 self.assertTrue(not unicodedata.ucd_3_2_0.mirrored("\u0f3a")) 284 # Also, we now have two ways of representing 285 # the upper-case mapping: as delta, or as absolute value 286 self.assertTrue("a".upper()=='A') 287 self.assertTrue("\u1d79".upper()=='\ua77d') 288 self.assertTrue(".".upper()=='.') 289 290 def test_bug_5828(self): 291 self.assertEqual("\u1d79".lower(), "\u1d79") 292 # Only U+0000 should have U+0000 as its upper/lower/titlecase variant 293 self.assertEqual( 294 [ 295 c for c in range(sys.maxunicode+1) 296 if "\x00" in chr(c).lower()+chr(c).upper()+chr(c).title() 297 ], 298 [0] 299 ) 300 301 def test_bug_4971(self): 302 # LETTER DZ WITH CARON: DZ, Dz, dz 303 self.assertEqual("\u01c4".title(), "\u01c5") 304 self.assertEqual("\u01c5".title(), "\u01c5") 305 self.assertEqual("\u01c6".title(), "\u01c5") 306 307 def test_linebreak_7643(self): 308 for i in range(0x10000): 309 lines = (chr(i) + 'A').splitlines() 310 if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85, 311 0x1c, 0x1d, 0x1e, 0x2028, 0x2029): 312 self.assertEqual(len(lines), 2, 313 r"\u%.4x should be a linebreak" % i) 314 else: 315 self.assertEqual(len(lines), 1, 316 r"\u%.4x should not be a linebreak" % i) 317 318class NormalizationTest(unittest.TestCase): 319 @staticmethod 320 def check_version(testfile): 321 hdr = testfile.readline() 322 return unicodedata.unidata_version in hdr 323 324 @staticmethod 325 def unistr(data): 326 data = [int(x, 16) for x in data.split(" ")] 327 return "".join([chr(x) for x in data]) 328 329 @requires_resource('network') 330 def test_normalization(self): 331 TESTDATAFILE = "NormalizationTest.txt" 332 TESTDATAURL = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{TESTDATAFILE}" 333 334 # Hit the exception early 335 try: 336 testdata = open_urlresource(TESTDATAURL, encoding="utf-8", 337 check=self.check_version) 338 except PermissionError: 339 self.skipTest(f"Permission error when downloading {TESTDATAURL} " 340 f"into the test data directory") 341 except (OSError, HTTPException): 342 self.fail(f"Could not retrieve {TESTDATAURL}") 343 344 with testdata: 345 self.run_normalization_tests(testdata) 346 347 def run_normalization_tests(self, testdata): 348 part = None 349 part1_data = {} 350 351 def NFC(str): 352 return unicodedata.normalize("NFC", str) 353 354 def NFKC(str): 355 return unicodedata.normalize("NFKC", str) 356 357 def NFD(str): 358 return unicodedata.normalize("NFD", str) 359 360 def NFKD(str): 361 return unicodedata.normalize("NFKD", str) 362 363 for line in testdata: 364 if '#' in line: 365 line = line.split('#')[0] 366 line = line.strip() 367 if not line: 368 continue 369 if line.startswith("@Part"): 370 part = line.split()[0] 371 continue 372 c1,c2,c3,c4,c5 = [self.unistr(x) for x in line.split(';')[:-1]] 373 374 # Perform tests 375 self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line) 376 self.assertTrue(c4 == NFC(c4) == NFC(c5), line) 377 self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line) 378 self.assertTrue(c5 == NFD(c4) == NFD(c5), line) 379 self.assertTrue(c4 == NFKC(c1) == NFKC(c2) == \ 380 NFKC(c3) == NFKC(c4) == NFKC(c5), 381 line) 382 self.assertTrue(c5 == NFKD(c1) == NFKD(c2) == \ 383 NFKD(c3) == NFKD(c4) == NFKD(c5), 384 line) 385 386 self.assertTrue(unicodedata.is_normalized("NFC", c2)) 387 self.assertTrue(unicodedata.is_normalized("NFC", c4)) 388 389 self.assertTrue(unicodedata.is_normalized("NFD", c3)) 390 self.assertTrue(unicodedata.is_normalized("NFD", c5)) 391 392 self.assertTrue(unicodedata.is_normalized("NFKC", c4)) 393 self.assertTrue(unicodedata.is_normalized("NFKD", c5)) 394 395 # Record part 1 data 396 if part == "@Part1": 397 part1_data[c1] = 1 398 399 # Perform tests for all other data 400 for c in range(sys.maxunicode+1): 401 X = chr(c) 402 if X in part1_data: 403 continue 404 self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c) 405 406 def test_edge_cases(self): 407 self.assertRaises(TypeError, unicodedata.normalize) 408 self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx') 409 self.assertEqual(unicodedata.normalize('NFKC', ''), '') 410 411 def test_bug_834676(self): 412 # Check for bug 834676 413 unicodedata.normalize('NFC', '\ud55c\uae00') 414 415 416if __name__ == "__main__": 417 unittest.main() 418