1""" Test script for the unicodedata module. 2 3 Written by Marc-Andre Lemburg (mal@lemburg.com). 4 5 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 6 7""" 8 9import sys 10import unittest 11import hashlib 12from test.support import script_helper 13 14encoding = 'utf-8' 15errors = 'surrogatepass' 16 17 18### Run tests 19 20class UnicodeMethodsTest(unittest.TestCase): 21 22 # update this, if the database changes 23 expectedchecksum = '9129d6f2bdf008a81c2476e5b5127014a62130c1' 24 25 def test_method_checksum(self): 26 h = hashlib.sha1() 27 for i in range(0x10000): 28 char = chr(i) 29 data = [ 30 # Predicates (single char) 31 "01"[char.isalnum()], 32 "01"[char.isalpha()], 33 "01"[char.isdecimal()], 34 "01"[char.isdigit()], 35 "01"[char.islower()], 36 "01"[char.isnumeric()], 37 "01"[char.isspace()], 38 "01"[char.istitle()], 39 "01"[char.isupper()], 40 41 # Predicates (multiple chars) 42 "01"[(char + 'abc').isalnum()], 43 "01"[(char + 'abc').isalpha()], 44 "01"[(char + '123').isdecimal()], 45 "01"[(char + '123').isdigit()], 46 "01"[(char + 'abc').islower()], 47 "01"[(char + '123').isnumeric()], 48 "01"[(char + ' \t').isspace()], 49 "01"[(char + 'abc').istitle()], 50 "01"[(char + 'ABC').isupper()], 51 52 # Mappings (single char) 53 char.lower(), 54 char.upper(), 55 char.title(), 56 57 # Mappings (multiple chars) 58 (char + 'abc').lower(), 59 (char + 'ABC').upper(), 60 (char + 'abc').title(), 61 (char + 'ABC').title(), 62 63 ] 64 h.update(''.join(data).encode(encoding, errors)) 65 result = h.hexdigest() 66 self.assertEqual(result, self.expectedchecksum) 67 68class UnicodeDatabaseTest(unittest.TestCase): 69 70 def setUp(self): 71 # In case unicodedata is not available, this will raise an ImportError, 72 # but the other test cases will still be run 73 import unicodedata 74 self.db = unicodedata 75 76 def tearDown(self): 77 del self.db 78 79class UnicodeFunctionsTest(UnicodeDatabaseTest): 80 81 # Update this if the database changes. Make sure to do a full rebuild 82 # (e.g. 'make distclean && make') to get the correct checksum. 83 expectedchecksum = 'c44a49ca7c5cb6441640fe174ede604b45028652' 84 def test_function_checksum(self): 85 data = [] 86 h = hashlib.sha1() 87 88 for i in range(0x10000): 89 char = chr(i) 90 data = [ 91 # Properties 92 format(self.db.digit(char, -1), '.12g'), 93 format(self.db.numeric(char, -1), '.12g'), 94 format(self.db.decimal(char, -1), '.12g'), 95 self.db.category(char), 96 self.db.bidirectional(char), 97 self.db.decomposition(char), 98 str(self.db.mirrored(char)), 99 str(self.db.combining(char)), 100 ] 101 h.update(''.join(data).encode("ascii")) 102 result = h.hexdigest() 103 self.assertEqual(result, self.expectedchecksum) 104 105 def test_digit(self): 106 self.assertEqual(self.db.digit('A', None), None) 107 self.assertEqual(self.db.digit('9'), 9) 108 self.assertEqual(self.db.digit('\u215b', None), None) 109 self.assertEqual(self.db.digit('\u2468'), 9) 110 self.assertEqual(self.db.digit('\U00020000', None), None) 111 self.assertEqual(self.db.digit('\U0001D7FD'), 7) 112 113 self.assertRaises(TypeError, self.db.digit) 114 self.assertRaises(TypeError, self.db.digit, 'xx') 115 self.assertRaises(ValueError, self.db.digit, 'x') 116 117 def test_numeric(self): 118 self.assertEqual(self.db.numeric('A',None), None) 119 self.assertEqual(self.db.numeric('9'), 9) 120 self.assertEqual(self.db.numeric('\u215b'), 0.125) 121 self.assertEqual(self.db.numeric('\u2468'), 9.0) 122 self.assertEqual(self.db.numeric('\ua627'), 7.0) 123 self.assertEqual(self.db.numeric('\U00020000', None), None) 124 self.assertEqual(self.db.numeric('\U0001012A'), 9000) 125 126 self.assertRaises(TypeError, self.db.numeric) 127 self.assertRaises(TypeError, self.db.numeric, 'xx') 128 self.assertRaises(ValueError, self.db.numeric, 'x') 129 130 def test_decimal(self): 131 self.assertEqual(self.db.decimal('A',None), None) 132 self.assertEqual(self.db.decimal('9'), 9) 133 self.assertEqual(self.db.decimal('\u215b', None), None) 134 self.assertEqual(self.db.decimal('\u2468', None), None) 135 self.assertEqual(self.db.decimal('\U00020000', None), None) 136 self.assertEqual(self.db.decimal('\U0001D7FD'), 7) 137 138 self.assertRaises(TypeError, self.db.decimal) 139 self.assertRaises(TypeError, self.db.decimal, 'xx') 140 self.assertRaises(ValueError, self.db.decimal, 'x') 141 142 def test_category(self): 143 self.assertEqual(self.db.category('\uFFFE'), 'Cn') 144 self.assertEqual(self.db.category('a'), 'Ll') 145 self.assertEqual(self.db.category('A'), 'Lu') 146 self.assertEqual(self.db.category('\U00020000'), 'Lo') 147 self.assertEqual(self.db.category('\U0001012A'), 'No') 148 149 self.assertRaises(TypeError, self.db.category) 150 self.assertRaises(TypeError, self.db.category, 'xx') 151 152 def test_bidirectional(self): 153 self.assertEqual(self.db.bidirectional('\uFFFE'), '') 154 self.assertEqual(self.db.bidirectional(' '), 'WS') 155 self.assertEqual(self.db.bidirectional('A'), 'L') 156 self.assertEqual(self.db.bidirectional('\U00020000'), 'L') 157 158 self.assertRaises(TypeError, self.db.bidirectional) 159 self.assertRaises(TypeError, self.db.bidirectional, 'xx') 160 161 def test_decomposition(self): 162 self.assertEqual(self.db.decomposition('\uFFFE'),'') 163 self.assertEqual(self.db.decomposition('\u00bc'), '<fraction> 0031 2044 0034') 164 165 self.assertRaises(TypeError, self.db.decomposition) 166 self.assertRaises(TypeError, self.db.decomposition, 'xx') 167 168 def test_mirrored(self): 169 self.assertEqual(self.db.mirrored('\uFFFE'), 0) 170 self.assertEqual(self.db.mirrored('a'), 0) 171 self.assertEqual(self.db.mirrored('\u2201'), 1) 172 self.assertEqual(self.db.mirrored('\U00020000'), 0) 173 174 self.assertRaises(TypeError, self.db.mirrored) 175 self.assertRaises(TypeError, self.db.mirrored, 'xx') 176 177 def test_combining(self): 178 self.assertEqual(self.db.combining('\uFFFE'), 0) 179 self.assertEqual(self.db.combining('a'), 0) 180 self.assertEqual(self.db.combining('\u20e1'), 230) 181 self.assertEqual(self.db.combining('\U00020000'), 0) 182 183 self.assertRaises(TypeError, self.db.combining) 184 self.assertRaises(TypeError, self.db.combining, 'xx') 185 186 def test_normalize(self): 187 self.assertRaises(TypeError, self.db.normalize) 188 self.assertRaises(ValueError, self.db.normalize, 'unknown', 'xx') 189 self.assertEqual(self.db.normalize('NFKC', ''), '') 190 # The rest can be found in test_normalization.py 191 # which requires an external file. 192 193 def test_pr29(self): 194 # http://www.unicode.org/review/pr-29.html 195 # See issues #1054943 and #10254. 196 composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161", 197 'Li\u030dt-s\u1e73\u0301', 198 '\u092e\u093e\u0930\u094d\u0915 \u091c\u093c' 199 + '\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917', 200 '\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c' 201 + '\u0938\u094d\u0924\u093e\u0928') 202 for text in composed: 203 self.assertEqual(self.db.normalize('NFC', text), text) 204 205 def test_issue10254(self): 206 # Crash reported in #10254 207 a = 'C\u0338' * 20 + 'C\u0327' 208 b = 'C\u0338' * 20 + '\xC7' 209 self.assertEqual(self.db.normalize('NFC', a), b) 210 211 def test_issue29456(self): 212 # Fix #29456 213 u1176_str_a = '\u1100\u1176\u11a8' 214 u1176_str_b = '\u1100\u1176\u11a8' 215 u11a7_str_a = '\u1100\u1175\u11a7' 216 u11a7_str_b = '\uae30\u11a7' 217 u11c3_str_a = '\u1100\u1175\u11c3' 218 u11c3_str_b = '\uae30\u11c3' 219 self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b) 220 self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b) 221 self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b) 222 223 # For tests of unicodedata.is_normalized / self.db.is_normalized , 224 # see test_normalization.py . 225 226 def test_east_asian_width(self): 227 eaw = self.db.east_asian_width 228 self.assertRaises(TypeError, eaw, b'a') 229 self.assertRaises(TypeError, eaw, bytearray()) 230 self.assertRaises(TypeError, eaw, '') 231 self.assertRaises(TypeError, eaw, 'ra') 232 self.assertEqual(eaw('\x1e'), 'N') 233 self.assertEqual(eaw('\x20'), 'Na') 234 self.assertEqual(eaw('\uC894'), 'W') 235 self.assertEqual(eaw('\uFF66'), 'H') 236 self.assertEqual(eaw('\uFF1F'), 'F') 237 self.assertEqual(eaw('\u2010'), 'A') 238 self.assertEqual(eaw('\U00020000'), 'W') 239 240 def test_east_asian_width_9_0_changes(self): 241 self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N') 242 self.assertEqual(self.db.east_asian_width('\u231a'), 'W') 243 244class UnicodeMiscTest(UnicodeDatabaseTest): 245 246 def test_failed_import_during_compiling(self): 247 # Issue 4367 248 # Decoding \N escapes requires the unicodedata module. If it can't be 249 # imported, we shouldn't segfault. 250 251 # This program should raise a SyntaxError in the eval. 252 code = "import sys;" \ 253 "sys.modules['unicodedata'] = None;" \ 254 """eval("'\\\\N{SOFT HYPHEN}'")""" 255 # We use a separate process because the unicodedata module may already 256 # have been loaded in this process. 257 result = script_helper.assert_python_failure("-c", code) 258 error = "SyntaxError: (unicode error) \\N escapes not supported " \ 259 "(can't load unicodedata module)" 260 self.assertIn(error, result.err.decode("ascii")) 261 262 def test_decimal_numeric_consistent(self): 263 # Test that decimal and numeric are consistent, 264 # i.e. if a character has a decimal value, 265 # its numeric value should be the same. 266 count = 0 267 for i in range(0x10000): 268 c = chr(i) 269 dec = self.db.decimal(c, -1) 270 if dec != -1: 271 self.assertEqual(dec, self.db.numeric(c)) 272 count += 1 273 self.assertTrue(count >= 10) # should have tested at least the ASCII digits 274 275 def test_digit_numeric_consistent(self): 276 # Test that digit and numeric are consistent, 277 # i.e. if a character has a digit value, 278 # its numeric value should be the same. 279 count = 0 280 for i in range(0x10000): 281 c = chr(i) 282 dec = self.db.digit(c, -1) 283 if dec != -1: 284 self.assertEqual(dec, self.db.numeric(c)) 285 count += 1 286 self.assertTrue(count >= 10) # should have tested at least the ASCII digits 287 288 def test_bug_1704793(self): 289 self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346') 290 291 def test_ucd_510(self): 292 import unicodedata 293 # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0 294 self.assertTrue(unicodedata.mirrored("\u0f3a")) 295 self.assertTrue(not unicodedata.ucd_3_2_0.mirrored("\u0f3a")) 296 # Also, we now have two ways of representing 297 # the upper-case mapping: as delta, or as absolute value 298 self.assertTrue("a".upper()=='A') 299 self.assertTrue("\u1d79".upper()=='\ua77d') 300 self.assertTrue(".".upper()=='.') 301 302 def test_bug_5828(self): 303 self.assertEqual("\u1d79".lower(), "\u1d79") 304 # Only U+0000 should have U+0000 as its upper/lower/titlecase variant 305 self.assertEqual( 306 [ 307 c for c in range(sys.maxunicode+1) 308 if "\x00" in chr(c).lower()+chr(c).upper()+chr(c).title() 309 ], 310 [0] 311 ) 312 313 def test_bug_4971(self): 314 # LETTER DZ WITH CARON: DZ, Dz, dz 315 self.assertEqual("\u01c4".title(), "\u01c5") 316 self.assertEqual("\u01c5".title(), "\u01c5") 317 self.assertEqual("\u01c6".title(), "\u01c5") 318 319 def test_linebreak_7643(self): 320 for i in range(0x10000): 321 lines = (chr(i) + 'A').splitlines() 322 if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85, 323 0x1c, 0x1d, 0x1e, 0x2028, 0x2029): 324 self.assertEqual(len(lines), 2, 325 r"\u%.4x should be a linebreak" % i) 326 else: 327 self.assertEqual(len(lines), 1, 328 r"\u%.4x should not be a linebreak" % i) 329 330if __name__ == "__main__": 331 unittest.main() 332