# test_multibytecodec.py
#   Unit tests for multibytecodec itself
#
4
5from test import test_support
6from test.test_support import TESTFN
7import unittest, StringIO, codecs, sys, os
8import _multibytecodec
9
# Names of every CJK codec the tests below iterate over, grouped under the
# _codecs_* module named in each comment.
ALL_CJKENCODINGS = [
    # _codecs_cn
    'gb2312',
    'gbk',
    'gb18030',
    'hz',
    # _codecs_hk
    'big5hkscs',
    # _codecs_jp
    'cp932',
    'shift_jis',
    'euc_jp',
    'euc_jisx0213',
    'shift_jisx0213',
    'euc_jis_2004',
    'shift_jis_2004',
    # _codecs_kr
    'cp949',
    'euc_kr',
    'johab',
    # _codecs_tw
    'big5',
    'cp950',
    # _codecs_iso2022
    'iso2022_jp',
    'iso2022_jp_1',
    'iso2022_jp_2',
    'iso2022_jp_2004',
    'iso2022_jp_3',
    'iso2022_jp_ext',
    'iso2022_kr',
]
26
27class Test_MultibyteCodec(unittest.TestCase):
28
29    def test_nullcoding(self):
30        for enc in ALL_CJKENCODINGS:
31            self.assertEqual(''.decode(enc), u'')
32            self.assertEqual(unicode('', enc), u'')
33            self.assertEqual(u''.encode(enc), '')
34
35    def test_str_decode(self):
36        for enc in ALL_CJKENCODINGS:
37            self.assertEqual('abcd'.encode(enc), 'abcd')
38
39    def test_errorcallback_longindex(self):
40        dec = codecs.getdecoder('euc-kr')
41        myreplace  = lambda exc: (u'', sys.maxint+1)
42        codecs.register_error('test.cjktest', myreplace)
43        self.assertRaises(IndexError, dec,
44                          'apple\x92ham\x93spam', 'test.cjktest')
45
46    def test_errorcallback_custom_ignore(self):
47        # Issue #23215: MemoryError with custom error handlers and multibyte codecs
48        data = 100 * unichr(0xdc00)
49        codecs.register_error("test.ignore", codecs.ignore_errors)
50        for enc in ALL_CJKENCODINGS:
51            self.assertEqual(data.encode(enc, "test.ignore"), b'')
52
53    def test_codingspec(self):
54        for enc in ALL_CJKENCODINGS:
55            code = '# coding: {}\n'.format(enc)
56            exec code
57
58    def test_init_segfault(self):
59        # bug #3305: this used to segfault
60        self.assertRaises(AttributeError,
61                          _multibytecodec.MultibyteStreamReader, None)
62        self.assertRaises(AttributeError,
63                          _multibytecodec.MultibyteStreamWriter, None)
64
65
class Test_IncrementalEncoder(unittest.TestCase):
    # Incremental encoder checks for cp949, jisx0213 and shift-jis.

    def test_stateless(self):
        # cp949 encoder isn't stateful at all.
        enc = codecs.getincrementalencoder('cp949')()

        produced = enc.encode(u'\ud30c\uc774\uc36c \ub9c8\uc744')
        self.assertEqual(produced,
                         '\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
        self.assertEqual(enc.reset(), None)

        produced = enc.encode(u'\u2606\u223c\u2606', True)
        self.assertEqual(produced, '\xa1\xd9\xa1\xad\xa1\xd9')
        self.assertEqual(enc.reset(), None)

        # Empty input yields empty output whether or not it is final.
        for final in (True, False):
            self.assertEqual(enc.encode(u'', final), '')
        self.assertEqual(enc.reset(), None)

    def test_stateful(self):
        # jisx0213 encoder is stateful for a few code points. eg)
        #   U+00E6 => A9DC
        #   U+00E6 U+0300 => ABC4
        #   U+0300 => ABDC
        enc = codecs.getincrementalencoder('jisx0213')()

        def feed_all(steps):
            # Each step pairs the positional arguments for encode() with
            # the output expected from that call; steps run in order.
            for call_args, expected in steps:
                self.assertEqual(enc.encode(*call_args), expected)

        feed_all([
            ((u'\u00e6\u0300',), '\xab\xc4'),
            ((u'\u00e6',), ''),
            ((u'\u0300',), '\xab\xc4'),
            ((u'\u00e6', True), '\xa9\xdc'),
        ])

        self.assertEqual(enc.reset(), None)
        feed_all([
            ((u'\u0300',), '\xab\xdc'),
            ((u'\u00e6',), ''),
            (('', True), '\xa9\xdc'),
            (('', True), ''),
        ])

    def test_stateful_keep_buffer(self):
        # A failed encode() call must leave the pending U+00E6 in place so
        # that later calls still see it.
        enc = codecs.getincrementalencoder('jisx0213')()
        unencodable = u'\u0123'

        self.assertEqual(enc.encode(u'\u00e6'), '')
        self.assertRaises(UnicodeEncodeError, enc.encode, unencodable)
        self.assertEqual(enc.encode(u'\u0300\u00e6'), '\xab\xc4')
        self.assertRaises(UnicodeEncodeError, enc.encode, unencodable)

        self.assertEqual(enc.reset(), None)
        self.assertEqual(enc.encode(u'\u0300'), '\xab\xdc')
        self.assertEqual(enc.encode(u'\u00e6'), '')
        self.assertRaises(UnicodeEncodeError, enc.encode, unencodable)
        self.assertEqual(enc.encode(u'', True), '\xa9\xdc')

    def test_issue5640(self):
        # With 'backslashreplace', U+00FF comes out as an escape and the
        # following newline is still encoded normally.
        enc = codecs.getincrementalencoder('shift-jis')('backslashreplace')
        for text, expected in ((u'\xff', b'\\xff'), (u'\n', b'\n')):
            self.assertEqual(enc.encode(text), expected)
116
class Test_IncrementalDecoder(unittest.TestCase):
    # Incremental decoder checks for cp949 and iso2022-jp.

    def test_dbcs(self):
        # cp949 decoder is simple with only 1 or 2 bytes sequences.
        dec = codecs.getincrementaldecoder('cp949')()
        chunks = (
            ('\xc6\xc4\xc0\xcc\xbd', u'\ud30c\uc774'),
            ('\xe3 \xb8\xb6\xc0\xbb', u'\uc36c \ub9c8\uc744'),
            ('', u''),
        )
        for raw, expected in chunks:
            self.assertEqual(dec.decode(raw), expected)

    def test_dbcs_keep_buffer(self):
        # A final decode() that raises must leave the buffered byte in
        # place, so the next decode() can still complete the character.
        dec = codecs.getincrementaldecoder('cp949')()
        for failing_final_input in ('', '\xcc\xbd'):
            self.assertEqual(dec.decode('\xc6\xc4\xc0'), u'\ud30c')
            self.assertRaises(UnicodeDecodeError,
                              dec.decode, failing_final_input, True)
            self.assertEqual(dec.decode('\xcc'), u'\uc774')

    def test_iso2022(self):
        # Escape sequences and two-byte characters are fed in pieces that
        # are split across decode() calls.
        ESC = '\x1b'
        dec = codecs.getincrementaldecoder('iso2022-jp')()
        feed = dec.decode
        check = self.assertEqual

        check(feed(ESC + '('), u'')
        check(feed('B', True), u'')
        check(feed(ESC + '$'), u'')
        check(feed('B@$'), u'\u4e16')
        check(feed('@$@'), u'\u4e16')
        check(feed('$', True), u'\u4e16')
        check(dec.reset(), None)
        check(feed('@$'), u'@$')
        check(feed(ESC + '$'), u'')
        self.assertRaises(UnicodeDecodeError, feed, '', True)
        check(feed('B@$'), u'\u4e16')
152
class Test_StreamReader(unittest.TestCase):
    # Regression test for reading a cp949 stream through codecs.open().

    def test_bug1728403(self):
        # A file holding the single byte '\xa1' is opened as cp949;
        # read(2) on it must raise UnicodeDecodeError.
        try:
            # Binary mode writes the byte untranslated, and the context
            # manager guarantees the handle is flushed and closed before
            # the file is reopened for reading.
            with open(TESTFN, 'wb') as raw:
                raw.write('\xa1')

            f = codecs.open(TESTFN, encoding='cp949')
            try:
                self.assertRaises(UnicodeDecodeError, f.read, 2)
            finally:
                # Only reached once codecs.open() succeeded, so there is no
                # unbound name to hide behind a bare except.
                f.close()
        finally:
            # test_support.unlink() tolerates a file that was never created,
            # so a failure above is not masked by a cleanup error.
            test_support.unlink(TESTFN)
163
class Test_StreamWriter(unittest.TestCase):
    # Stream writer checks; the surrogate-pair tests only run on builds
    # where u'\U00012345' has length 2.

    @unittest.skipUnless(len(u'\U00012345') == 2, 'need a narrow build')
    def test_gb18030(self):
        # A lone leading surrogate produces no output until its partner
        # arrives; reset() with one still pending raises UnicodeError.
        astral = u'\U00012345'
        sink = StringIO.StringIO()
        writer = codecs.getwriter('gb18030')(sink)

        writer.write(u'123')
        self.assertEqual(sink.getvalue(), '123')
        writer.write(astral)
        self.assertEqual(sink.getvalue(), '123\x907\x959')
        writer.write(astral[0])
        self.assertEqual(sink.getvalue(), '123\x907\x959')

        writer.write(astral[1] + astral + u'\uac00\u00ac')
        everything = '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851'
        self.assertEqual(sink.getvalue(), everything)

        # From here on the output must stay unchanged.
        writer.write(astral[0])
        self.assertEqual(sink.getvalue(), everything)
        self.assertRaises(UnicodeError, writer.reset)
        self.assertEqual(sink.getvalue(), everything)

    @unittest.skipUnless(len(u'\U00012345') == 2, 'need a narrow build')
    def test_utf_8(self):
        sink = StringIO.StringIO()
        writer = codecs.getwriter('utf-8')(sink)

        writer.write(u'123')
        self.assertEqual(sink.getvalue(), '123')
        writer.write(u'\U00012345')
        self.assertEqual(sink.getvalue(), '123\xf0\x92\x8d\x85')

        # Python utf-8 codec can't buffer surrogate pairs yet.
        if 0:
            writer.write(u'\U00012345'[0])
            self.assertEqual(sink.getvalue(), '123\xf0\x92\x8d\x85')
            writer.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
            self.assertEqual(sink.getvalue(),
                '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
                '\xea\xb0\x80\xc2\xac')
            writer.write(u'\U00012345'[0])
            self.assertEqual(sink.getvalue(),
                '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
                '\xea\xb0\x80\xc2\xac')
            writer.reset()
            self.assertEqual(sink.getvalue(),
                '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
                '\xea\xb0\x80\xc2\xac\xed\xa0\x88')
            writer.write(u'\U00012345'[1])
            self.assertEqual(sink.getvalue(),
                '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
                '\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85')

    def test_streamwriter_strwrite(self):
        # Writing a plain byte string through the writer passes it along.
        sink = StringIO.StringIO()
        writer = codecs.getwriter('gb18030')(sink)
        writer.write('abcd')
        self.assertEqual(sink.getvalue(), 'abcd')
220
class Test_ISO2022(unittest.TestCase):
    # Checks specific to the ISO 2022 family of codecs.

    def test_g2(self):
        encoded = '\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
        expected = u':hu4:unit\xe9 de famille'
        self.assertEqual(encoded.decode('iso2022-jp-2'), expected)

    def test_iso2022_jp_g0(self):
        soft_hyphen = u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2')
        self.assertNotIn('\x0e', soft_hyphen)
        for codec_name in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
            encoded = u'\u3406'.encode(codec_name)
            # No byte at or above 0x80 may appear in the output.
            high_bytes = filter(lambda ch: ch >= '\x80', encoded)
            self.assertFalse(high_bytes)

    def test_bug1572832(self):
        if sys.maxunicode < 0x10000:
            # unichr() cannot produce these code points directly here, so
            # build each one from two unichr() calls instead.
            def myunichr(x):
                return unichr(0xD7C0+(x>>10)) + unichr(0xDC00+(x&0x3FF))
        else:
            myunichr = unichr

        for codepoint in xrange(0x10000, 0x110000):
            # Any ISO 2022 codec will cause the segfault
            myunichr(codepoint).encode('iso_2022_jp', 'ignore')
242
class TestStateful(unittest.TestCase):
    # The class attributes below parameterize every test method:
    #   text           -- the unicode sample to encode
    #   encoding       -- codec name under test
    #   expected       -- output of char-by-char incremental encoding
    #                     when no call is marked final
    #   expected_reset -- output of a one-shot encode, and of incremental
    #                     encoding whose last call is marked final
    text = u'\u4E16\u4E16'
    encoding = 'iso-2022-jp'
    expected = b'\x1b$B@$@$'
    expected_reset = b'\x1b$B@$@$\x1b(B'

    def test_encode(self):
        encoded = self.text.encode(self.encoding)
        self.assertEqual(encoded, self.expected_reset)

    def test_incrementalencoder(self):
        enc = codecs.getincrementalencoder(self.encoding)()
        pieces = [enc.encode(ch) for ch in self.text]
        self.assertEqual(b''.join(pieces), self.expected)

    def test_incrementalencoder_final(self):
        enc = codecs.getincrementalencoder(self.encoding)()
        final_position = len(self.text) - 1
        pieces = []
        for position, ch in enumerate(self.text):
            # Only the very last character is passed with final=True.
            pieces.append(enc.encode(ch, position == final_position))
        self.assertEqual(b''.join(pieces), self.expected_reset)
266
class TestHZStateful(TestStateful):
    # Runs the inherited TestStateful test methods against the 'hz' codec
    # by overriding the sample text and the expected byte strings.
    encoding = 'hz'
    text = u'\u804a\u804a'
    expected = b'~{ADAD'
    expected_reset = b'~{ADAD~}'
272
def test_main():
    # Hand this module's name to test_support.run_unittest().
    test_support.run_unittest(__name__)


# Run the tests when this file is executed as a script.
if __name__ == "__main__":
    test_main()
278