• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1""" Test script for the Unicode implementation.
2
3Written by Bill Tutt.
4Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import unittest
11import sys
12
13from test import test_support
14
15try:
16    from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
17except ImportError:
18    INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1
19
20class UnicodeNamesTest(unittest.TestCase):
21
22    def checkletter(self, name, code):
23        # Helper that put all \N escapes inside eval'd raw strings,
24        # to make sure this script runs even if the compiler
25        # chokes on \N escapes
26        res = eval(ur'u"\N{%s}"' % name)
27        self.assertEqual(res, code)
28        return res
29
30    def test_general(self):
31        # General and case insensitivity test:
32        chars = [
33            "LATIN CAPITAL LETTER T",
34            "LATIN SMALL LETTER H",
35            "LATIN SMALL LETTER E",
36            "SPACE",
37            "LATIN SMALL LETTER R",
38            "LATIN CAPITAL LETTER E",
39            "LATIN SMALL LETTER D",
40            "SPACE",
41            "LATIN SMALL LETTER f",
42            "LATIN CAPITAL LeTtEr o",
43            "LATIN SMaLl LETTER x",
44            "SPACE",
45            "LATIN SMALL LETTER A",
46            "LATIN SMALL LETTER T",
47            "LATIN SMALL LETTER E",
48            "SPACE",
49            "LATIN SMALL LETTER T",
50            "LATIN SMALL LETTER H",
51            "LATIN SMALL LETTER E",
52            "SpAcE",
53            "LATIN SMALL LETTER S",
54            "LATIN SMALL LETTER H",
55            "LATIN small LETTER e",
56            "LATIN small LETTER e",
57            "LATIN SMALL LETTER P",
58            "FULL STOP"
59        ]
60        string = u"The rEd fOx ate the sheep."
61
62        self.assertEqual(
63            u"".join([self.checkletter(*args) for args in zip(chars, string)]),
64            string
65        )
66
67    def test_ascii_letters(self):
68        import unicodedata
69
70        for char in "".join(map(chr, xrange(ord("a"), ord("z")))):
71            name = "LATIN SMALL LETTER %s" % char.upper()
72            code = unicodedata.lookup(name)
73            self.assertEqual(unicodedata.name(code), name)
74
75    def test_hangul_syllables(self):
76        self.checkletter("HANGUL SYLLABLE GA", u"\uac00")
77        self.checkletter("HANGUL SYLLABLE GGWEOSS", u"\uafe8")
78        self.checkletter("HANGUL SYLLABLE DOLS", u"\ub3d0")
79        self.checkletter("HANGUL SYLLABLE RYAN", u"\ub7b8")
80        self.checkletter("HANGUL SYLLABLE MWIK", u"\ubba0")
81        self.checkletter("HANGUL SYLLABLE BBWAEM", u"\ubf88")
82        self.checkletter("HANGUL SYLLABLE SSEOL", u"\uc370")
83        self.checkletter("HANGUL SYLLABLE YI", u"\uc758")
84        self.checkletter("HANGUL SYLLABLE JJYOSS", u"\ucb40")
85        self.checkletter("HANGUL SYLLABLE KYEOLS", u"\ucf28")
86        self.checkletter("HANGUL SYLLABLE PAN", u"\ud310")
87        self.checkletter("HANGUL SYLLABLE HWEOK", u"\ud6f8")
88        self.checkletter("HANGUL SYLLABLE HIH", u"\ud7a3")
89
90        import unicodedata
91        self.assertRaises(ValueError, unicodedata.name, u"\ud7a4")
92
93    def test_cjk_unified_ideographs(self):
94        self.checkletter("CJK UNIFIED IDEOGRAPH-3400", u"\u3400")
95        self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", u"\u4db5")
96        self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", u"\u4e00")
97        self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", u"\u9fa5")
98        self.checkletter("CJK UNIFIED IDEOGRAPH-20000", u"\U00020000")
99        self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", u"\U0002a6d6")
100
101    def test_bmp_characters(self):
102        import unicodedata
103        count = 0
104        for code in xrange(0x10000):
105            char = unichr(code)
106            name = unicodedata.name(char, None)
107            if name is not None:
108                self.assertEqual(unicodedata.lookup(name), char)
109                count += 1
110
111    def test_misc_symbols(self):
112        self.checkletter("PILCROW SIGN", u"\u00b6")
113        self.checkletter("REPLACEMENT CHARACTER", u"\uFFFD")
114        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", u"\uFF9F")
115        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", u"\uFF41")
116
117    def test_errors(self):
118        import unicodedata
119        self.assertRaises(TypeError, unicodedata.name)
120        self.assertRaises(TypeError, unicodedata.name, u'xx')
121        self.assertRaises(TypeError, unicodedata.lookup)
122        self.assertRaises(KeyError, unicodedata.lookup, u'unknown')
123
124    def test_strict_eror_handling(self):
125        # bogus character name
126        self.assertRaises(
127            UnicodeError,
128            unicode, "\\N{blah}", 'unicode-escape', 'strict'
129        )
130        # long bogus character name
131        self.assertRaises(
132            UnicodeError,
133            unicode, "\\N{%s}" % ("x" * 100000), 'unicode-escape', 'strict'
134        )
135        # missing closing brace
136        self.assertRaises(
137            UnicodeError,
138            unicode, "\\N{SPACE", 'unicode-escape', 'strict'
139        )
140        # missing opening brace
141        self.assertRaises(
142            UnicodeError,
143            unicode, "\\NSPACE", 'unicode-escape', 'strict'
144        )
145
146    @test_support.cpython_only
147    @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
148    @unittest.skipUnless(UINT_MAX < sys.maxint, "needs UINT_MAX < sys.maxint")
149    @test_support.bigmemtest(minsize=UINT_MAX + 1,
150                             memuse=2 + 4 // len(u'\U00010000'))
151    def test_issue16335(self, size):
152        func = self.test_issue16335
153        if size < func.minsize:
154            raise unittest.SkipTest("not enough memory: %.1fG minimum needed" %
155                    (func.minsize * func.memuse / float(1024**3),))
156        # very very long bogus character name
157        x = b'\\N{SPACE' + b'x' * int(UINT_MAX + 1) + b'}'
158        self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
159        self.assertRaisesRegexp(UnicodeError,
160            'unknown Unicode character name',
161            x.decode, 'unicode-escape'
162        )
163
164
def test_main():
    """Run all tests of UnicodeNamesTest via test_support.run_unittest."""
    test_classes = (UnicodeNamesTest,)
    test_support.run_unittest(*test_classes)


if __name__ == "__main__":
    # Executed as a script: run the test suite directly.
    test_main()
170