1""" Test script for the Unicode implementation. 2 3Written by Bill Tutt. 4Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) 5 6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 7 8"""#" 9 10import unittest 11import unicodedata 12 13from test import support 14from http.client import HTTPException 15from test.test_normalization import check_version 16 17try: 18 from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX 19except ImportError: 20 INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1 21 22class UnicodeNamesTest(unittest.TestCase): 23 24 def checkletter(self, name, code): 25 # Helper that put all \N escapes inside eval'd raw strings, 26 # to make sure this script runs even if the compiler 27 # chokes on \N escapes 28 res = eval(r'"\N{%s}"' % name) 29 self.assertEqual(res, code) 30 return res 31 32 def test_general(self): 33 # General and case insensitivity test: 34 chars = [ 35 "LATIN CAPITAL LETTER T", 36 "LATIN SMALL LETTER H", 37 "LATIN SMALL LETTER E", 38 "SPACE", 39 "LATIN SMALL LETTER R", 40 "LATIN CAPITAL LETTER E", 41 "LATIN SMALL LETTER D", 42 "SPACE", 43 "LATIN SMALL LETTER f", 44 "LATIN CAPITAL LeTtEr o", 45 "LATIN SMaLl LETTER x", 46 "SPACE", 47 "LATIN SMALL LETTER A", 48 "LATIN SMALL LETTER T", 49 "LATIN SMALL LETTER E", 50 "SPACE", 51 "LATIN SMALL LETTER T", 52 "LATIN SMALL LETTER H", 53 "LATIN SMALL LETTER E", 54 "SpAcE", 55 "LATIN SMALL LETTER S", 56 "LATIN SMALL LETTER H", 57 "LATIN small LETTER e", 58 "LATIN small LETTER e", 59 "LATIN SMALL LETTER P", 60 "FULL STOP" 61 ] 62 string = "The rEd fOx ate the sheep." 63 64 self.assertEqual( 65 "".join([self.checkletter(*args) for args in zip(chars, string)]), 66 string 67 ) 68 69 def test_ascii_letters(self): 70 for char in "".join(map(chr, range(ord("a"), ord("z")))): 71 name = "LATIN SMALL LETTER %s" % char.upper() 72 code = unicodedata.lookup(name) 73 self.assertEqual(unicodedata.name(code), name) 74 75 def test_hangul_syllables(self): 76 self.checkletter("HANGUL SYLLABLE GA", "\uac00") 77 self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8") 78 self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0") 79 self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8") 80 self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0") 81 self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88") 82 self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370") 83 self.checkletter("HANGUL SYLLABLE YI", "\uc758") 84 self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40") 85 self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28") 86 self.checkletter("HANGUL SYLLABLE PAN", "\ud310") 87 self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8") 88 self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3") 89 90 self.assertRaises(ValueError, unicodedata.name, "\ud7a4") 91 92 def test_cjk_unified_ideographs(self): 93 self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400") 94 self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5") 95 self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00") 96 self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB") 97 self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000") 98 self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6") 99 self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700") 100 self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734") 101 self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740") 102 self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D") 103 104 def test_bmp_characters(self): 105 for code in range(0x10000): 106 char = chr(code) 107 name = unicodedata.name(char, None) 108 if name is not None: 109 self.assertEqual(unicodedata.lookup(name), char) 110 111 def test_misc_symbols(self): 112 self.checkletter("PILCROW SIGN", "\u00b6") 113 self.checkletter("REPLACEMENT CHARACTER", "\uFFFD") 114 self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F") 115 self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41") 116 117 def test_aliases(self): 118 # Check that the aliases defined in the NameAliases.txt file work. 119 # This should be updated when new aliases are added or the file 120 # should be downloaded and parsed instead. See #12753. 121 aliases = [ 122 ('LATIN CAPITAL LETTER GHA', 0x01A2), 123 ('LATIN SMALL LETTER GHA', 0x01A3), 124 ('KANNADA LETTER LLLA', 0x0CDE), 125 ('LAO LETTER FO FON', 0x0E9D), 126 ('LAO LETTER FO FAY', 0x0E9F), 127 ('LAO LETTER RO', 0x0EA3), 128 ('LAO LETTER LO', 0x0EA5), 129 ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0), 130 ('YI SYLLABLE ITERATION MARK', 0xA015), 131 ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18), 132 ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5) 133 ] 134 for alias, codepoint in aliases: 135 self.checkletter(alias, chr(codepoint)) 136 name = unicodedata.name(chr(codepoint)) 137 self.assertNotEqual(name, alias) 138 self.assertEqual(unicodedata.lookup(alias), 139 unicodedata.lookup(name)) 140 with self.assertRaises(KeyError): 141 unicodedata.ucd_3_2_0.lookup(alias) 142 143 def test_aliases_names_in_pua_range(self): 144 # We are storing aliases in the PUA 15, but their names shouldn't leak 145 for cp in range(0xf0000, 0xf0100): 146 with self.assertRaises(ValueError) as cm: 147 unicodedata.name(chr(cp)) 148 self.assertEqual(str(cm.exception), 'no such name') 149 150 def test_named_sequences_names_in_pua_range(self): 151 # We are storing named seq in the PUA 15, but their names shouldn't leak 152 for cp in range(0xf0100, 0xf0fff): 153 with self.assertRaises(ValueError) as cm: 154 unicodedata.name(chr(cp)) 155 self.assertEqual(str(cm.exception), 'no such name') 156 157 def test_named_sequences_sample(self): 158 # Check a few named sequences. See #12753. 159 sequences = [ 160 ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'), 161 ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'), 162 ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'), 163 ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'), 164 ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'), 165 ] 166 for seqname, codepoints in sequences: 167 self.assertEqual(unicodedata.lookup(seqname), codepoints) 168 with self.assertRaises(SyntaxError): 169 self.checkletter(seqname, None) 170 with self.assertRaises(KeyError): 171 unicodedata.ucd_3_2_0.lookup(seqname) 172 173 def test_named_sequences_full(self): 174 # Check all the named sequences 175 url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" % 176 unicodedata.unidata_version) 177 try: 178 testdata = support.open_urlresource(url, encoding="utf-8", 179 check=check_version) 180 except (OSError, HTTPException): 181 self.skipTest("Could not retrieve " + url) 182 self.addCleanup(testdata.close) 183 for line in testdata: 184 line = line.strip() 185 if not line or line.startswith('#'): 186 continue 187 seqname, codepoints = line.split(';') 188 codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split()) 189 self.assertEqual(unicodedata.lookup(seqname), codepoints) 190 with self.assertRaises(SyntaxError): 191 self.checkletter(seqname, None) 192 with self.assertRaises(KeyError): 193 unicodedata.ucd_3_2_0.lookup(seqname) 194 195 def test_errors(self): 196 self.assertRaises(TypeError, unicodedata.name) 197 self.assertRaises(TypeError, unicodedata.name, 'xx') 198 self.assertRaises(TypeError, unicodedata.lookup) 199 self.assertRaises(KeyError, unicodedata.lookup, 'unknown') 200 201 def test_strict_error_handling(self): 202 # bogus character name 203 self.assertRaises( 204 UnicodeError, 205 str, b"\\N{blah}", 'unicode-escape', 'strict' 206 ) 207 # long bogus character name 208 self.assertRaises( 209 UnicodeError, 210 str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict' 211 ) 212 # missing closing brace 213 self.assertRaises( 214 UnicodeError, 215 str, b"\\N{SPACE", 'unicode-escape', 'strict' 216 ) 217 # missing opening brace 218 self.assertRaises( 219 UnicodeError, 220 str, b"\\NSPACE", 'unicode-escape', 'strict' 221 ) 222 223 @support.cpython_only 224 @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX") 225 @support.bigmemtest(size=UINT_MAX + 1, memuse=2 + 1, dry_run=False) 226 def test_issue16335(self, size): 227 # very very long bogus character name 228 x = b'\\N{SPACE' + b'x' * (UINT_MAX + 1) + b'}' 229 self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1)) 230 self.assertRaisesRegex(UnicodeError, 231 'unknown Unicode character name', 232 x.decode, 'unicode-escape' 233 ) 234 235 236if __name__ == "__main__": 237 unittest.main() 238