• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import test.test_support, unittest
2import sys, codecs, htmlentitydefs, unicodedata
3
class PosReturn:
    """Error-handler callback whose restart position is configurable.

    Assign ``pos`` before running a codec; ``handle`` hands that value
    back to the codec as the position at which to resume.
    """

    def __init__(self):
        self.pos = 0

    def handle(self, exc):
        """Return ``(u"<?>", pos)`` for the codec error *exc*.

        ``exc`` must provide ``object`` and ``start``.  The position that
        was configured when the call started is the one returned, even if
        ``self.pos`` is changed below.
        """
        restart = self.pos
        # A negative position counts from the end of the input.
        absolute = restart if restart >= 0 else len(exc.object) + restart
        if absolute <= exc.start:
            # No progress would be made this time, so make the next call
            # resume at the end of the input; otherwise the codec would
            # call this handler forever.
            self.pos = len(exc.object)
        return (u"<?>", restart)
20
21# A UnicodeEncodeError object with a bad start attribute
class BadStartUnicodeEncodeError(UnicodeEncodeError):
    """UnicodeEncodeError whose ``start`` is overwritten with a list."""

    def __init__(self):
        super(BadStartUnicodeEncodeError, self).__init__(
            "ascii", u"", 0, 1, "bad")
        # Deliberately not an integer.
        self.start = []
26
27# A UnicodeEncodeError object with a bad object attribute
class BadObjectUnicodeEncodeError(UnicodeEncodeError):
    """UnicodeEncodeError whose ``object`` is overwritten with a list."""

    def __init__(self):
        super(BadObjectUnicodeEncodeError, self).__init__(
            "ascii", u"", 0, 1, "bad")
        # Deliberately not a unicode string.
        self.object = []
32
33# A UnicodeDecodeError object without an end attribute
class NoEndUnicodeDecodeError(UnicodeDecodeError):
    """UnicodeDecodeError whose ``end`` attribute is deleted on creation."""

    def __init__(self):
        super(NoEndUnicodeDecodeError, self).__init__(
            "ascii", "", 0, 1, "bad")
        del self.end
38
39# A UnicodeDecodeError object with a bad object attribute
class BadObjectUnicodeDecodeError(UnicodeDecodeError):
    """UnicodeDecodeError whose ``object`` is overwritten with a list."""

    def __init__(self):
        super(BadObjectUnicodeDecodeError, self).__init__(
            "ascii", "", 0, 1, "bad")
        # Deliberately not a byte string.
        self.object = []
44
45# A UnicodeTranslateError object without a start attribute
class NoStartUnicodeTranslateError(UnicodeTranslateError):
    """UnicodeTranslateError whose ``start`` is deleted on creation."""

    def __init__(self):
        super(NoStartUnicodeTranslateError, self).__init__(
            u"", 0, 1, "bad")
        del self.start
50
51# A UnicodeTranslateError object without an end attribute
class NoEndUnicodeTranslateError(UnicodeTranslateError):
    """UnicodeTranslateError whose ``end`` is deleted on creation."""

    def __init__(self):
        super(NoEndUnicodeTranslateError, self).__init__(
            u"", 0, 1, "bad")
        del self.end
56
57# A UnicodeTranslateError object without an object attribute
class NoObjectUnicodeTranslateError(UnicodeTranslateError):
    """UnicodeTranslateError whose ``object`` is deleted on creation."""

    def __init__(self):
        super(NoObjectUnicodeTranslateError, self).__init__(
            u"", 0, 1, "bad")
        del self.object
62
63class CodecCallbackTest(unittest.TestCase):
64
65    def test_xmlcharrefreplace(self):
66        # replace unencodable characters which numeric character entities.
67        # For ascii, latin-1 and charmaps this is completely implemented
68        # in C and should be reasonably fast.
69        s = u"\u30b9\u30d1\u30e2 \xe4nd egg\u0161"
70        self.assertEqual(
71            s.encode("ascii", "xmlcharrefreplace"),
72            "&#12473;&#12497;&#12514; &#228;nd egg&#353;"
73        )
74        self.assertEqual(
75            s.encode("latin-1", "xmlcharrefreplace"),
76            "&#12473;&#12497;&#12514; \xe4nd egg&#353;"
77        )
78        self.assertEqual(
79            s.encode("iso-8859-15", "xmlcharrefreplace"),
80            "&#12473;&#12497;&#12514; \xe4nd egg\xa8"
81        )
82
83    def test_xmlcharrefreplace_with_surrogates(self):
84        tests = [(u'\U0001f49d', '&#128157;'),
85                 (u'\ud83d', '&#55357;'),
86                 (u'\udc9d', '&#56477;'),
87                ]
88        if u'\ud83d\udc9d' != u'\U0001f49d':
89            tests += [(u'\ud83d\udc9d', '&#55357;&#56477;')]
90        for encoding in ['ascii', 'latin1', 'iso-8859-15']:
91            for s, exp in tests:
92                self.assertEqual(s.encode(encoding, 'xmlcharrefreplace'),
93                                 exp, msg='%r.encode(%r)' % (s, encoding))
94                self.assertEqual((s+'X').encode(encoding, 'xmlcharrefreplace'),
95                                 exp+'X',
96                                 msg='%r.encode(%r)' % (s + 'X', encoding))
97
98    def test_xmlcharnamereplace(self):
99        # This time use a named character entity for unencodable
100        # characters, if one is available.
101
102        def xmlcharnamereplace(exc):
103            if not isinstance(exc, UnicodeEncodeError):
104                raise TypeError("don't know how to handle %r" % exc)
105            l = []
106            for c in exc.object[exc.start:exc.end]:
107                try:
108                    l.append(u"&%s;" % htmlentitydefs.codepoint2name[ord(c)])
109                except KeyError:
110                    l.append(u"&#%d;" % ord(c))
111            return (u"".join(l), exc.end)
112
113        codecs.register_error(
114            "test.xmlcharnamereplace", xmlcharnamereplace)
115
116        sin = u"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
117        sout = "&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
118        self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
119        sout = "\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
120        self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
121        sout = "\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
122        self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)
123
124    def test_uninamereplace(self):
125        # We're using the names from the unicode database this time,
126        # and we're doing "syntax highlighting" here, i.e. we include
127        # the replaced text in ANSI escape sequences. For this it is
128        # useful that the error handler is not called for every single
129        # unencodable character, but for a complete sequence of
130        # unencodable characters, otherwise we would output many
131        # unnecessary escape sequences.
132
133        def uninamereplace(exc):
134            if not isinstance(exc, UnicodeEncodeError):
135                raise TypeError("don't know how to handle %r" % exc)
136            l = []
137            for c in exc.object[exc.start:exc.end]:
138                l.append(unicodedata.name(c, u"0x%x" % ord(c)))
139            return (u"\033[1m%s\033[0m" % u", ".join(l), exc.end)
140
141        codecs.register_error(
142            "test.uninamereplace", uninamereplace)
143
144        sin = u"\xac\u1234\u20ac\u8000"
145        sout = "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
146        self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)
147
148        sout = "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
149        self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)
150
151        sout = "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
152        self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)
153
154    def test_backslashescape(self):
155        # Does the same as the "unicode-escape" encoding, but with different
156        # base encodings.
157        sin = u"a\xac\u1234\u20ac\u8000"
158        if sys.maxunicode > 0xffff:
159            sin += unichr(sys.maxunicode)
160        sout = "a\\xac\\u1234\\u20ac\\u8000"
161        if sys.maxunicode > 0xffff:
162            sout += "\\U%08x" % sys.maxunicode
163        self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)
164
165        sout = "a\xac\\u1234\\u20ac\\u8000"
166        if sys.maxunicode > 0xffff:
167            sout += "\\U%08x" % sys.maxunicode
168        self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)
169
170        sout = "a\xac\\u1234\xa4\\u8000"
171        if sys.maxunicode > 0xffff:
172            sout += "\\U%08x" % sys.maxunicode
173        self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
174
175    def test_decoding_callbacks(self):
176        # This is a test for a decoding callback handler
177        # that allows the decoding of the invalid sequence
178        # "\xc0\x80" and returns "\x00" instead of raising an error.
179        # All other illegal sequences will be handled strictly.
180        def relaxedutf8(exc):
181            if not isinstance(exc, UnicodeDecodeError):
182                raise TypeError("don't know how to handle %r" % exc)
183            if exc.object[exc.start:exc.start+2] == "\xc0\x80":
184                return (u"\x00", exc.start+2) # retry after two bytes
185            else:
186                raise exc
187
188        codecs.register_error("test.relaxedutf8", relaxedutf8)
189
190        # all the "\xc0\x80" will be decoded to "\x00"
191        sin = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
192        sout = u"a\x00b\x00c\xfc\x00\x00"
193        self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
194
195        # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised
196        sin = "\xc0\x80\xc0\x81"
197        self.assertRaises(UnicodeDecodeError, sin.decode,
198                          "utf-8", "test.relaxedutf8")
199
200    def test_charmapencode(self):
201        # For charmap encodings the replacement string will be
202        # mapped through the encoding again. This means, that
203        # to be able to use e.g. the "replace" handler, the
204        # charmap has to have a mapping for "?".
205        charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"])
206        sin = u"abc"
207        sout = "AABBCC"
208        self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
209
210        sin = u"abcA"
211        self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
212
213        charmap[ord("?")] = "XYZ"
214        sin = u"abcDEF"
215        sout = "AABBCCXYZXYZXYZ"
216        self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
217
218        charmap[ord("?")] = u"XYZ"
219        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
220
221        charmap[ord("?")] = u"XYZ"
222        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
223
224    def test_decodeunicodeinternal(self):
225        self.assertRaises(
226            UnicodeDecodeError,
227            "\x00\x00\x00\x00\x00".decode,
228            "unicode-internal",
229        )
230        if sys.maxunicode > 0xffff:
231            def handler_unicodeinternal(exc):
232                if not isinstance(exc, UnicodeDecodeError):
233                    raise TypeError("don't know how to handle %r" % exc)
234                return (u"\x01", 1)
235
236            self.assertEqual(
237                "\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"),
238                u"\u0000"
239            )
240
241            self.assertEqual(
242                "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"),
243                u"\u0000\ufffd"
244            )
245
246            codecs.register_error("test.hui", handler_unicodeinternal)
247
248            self.assertEqual(
249                "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"),
250                u"\u0000\u0001\u0000"
251            )
252
253    def test_callbacks(self):
254        def handler1(exc):
255            if not isinstance(exc, UnicodeEncodeError) \
256               and not isinstance(exc, UnicodeDecodeError):
257                raise TypeError("don't know how to handle %r" % exc)
258            l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
259            return (u"[%s]" % u"".join(l), exc.end)
260
261        codecs.register_error("test.handler1", handler1)
262
263        def handler2(exc):
264            if not isinstance(exc, UnicodeDecodeError):
265                raise TypeError("don't know how to handle %r" % exc)
266            l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
267            return (u"[%s]" % u"".join(l), exc.end+1) # skip one character
268
269        codecs.register_error("test.handler2", handler2)
270
271        s = "\x00\x81\x7f\x80\xff"
272
273        self.assertEqual(
274            s.decode("ascii", "test.handler1"),
275            u"\x00[<129>]\x7f[<128>][<255>]"
276        )
277        self.assertEqual(
278            s.decode("ascii", "test.handler2"),
279            u"\x00[<129>][<128>]"
280        )
281
282        self.assertEqual(
283            "\\u3042\u3xxx".decode("unicode-escape", "test.handler1"),
284            u"\u3042[<92><117><51>]xxx"
285        )
286
287        self.assertEqual(
288            "\\u3042\u3xx".decode("unicode-escape", "test.handler1"),
289            u"\u3042[<92><117><51>]xx"
290        )
291
292        self.assertEqual(
293            codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0],
294            u"z[<98>][<99>]"
295        )
296
297        self.assertEqual(
298            u"g\xfc\xdfrk".encode("ascii", "test.handler1"),
299            u"g[<252><223>]rk"
300        )
301
302        self.assertEqual(
303            u"g\xfc\xdf".encode("ascii", "test.handler1"),
304            u"g[<252><223>]"
305        )
306
307    def test_longstrings(self):
308        # test long strings to check for memory overflow problems
309        errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
310                   "backslashreplace"]
311        # register the handlers under different names,
312        # to prevent the codec from recognizing the name
313        for err in errors:
314            codecs.register_error("test." + err, codecs.lookup_error(err))
315        l = 1000
316        errors += [ "test." + err for err in errors ]
317        for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
318            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
319                        "utf-8", "utf-7", "utf-16", "utf-32"):
320                for err in errors:
321                    try:
322                        uni.encode(enc, err)
323                    except UnicodeError:
324                        pass
325
326    def check_exceptionobjectargs(self, exctype, args, msg):
327        # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
328        # check with one missing argument
329        self.assertRaises(TypeError, exctype, *args[:-1])
330        # check with one argument too much
331        self.assertRaises(TypeError, exctype, *(args + ["too much"]))
332        # check with one argument of the wrong type
333        wrongargs = [ "spam", u"eggs", 42, 1.0, None ]
334        for i in xrange(len(args)):
335            for wrongarg in wrongargs:
336                if type(wrongarg) is type(args[i]):
337                    continue
338                # build argument array
339                callargs = []
340                for j in xrange(len(args)):
341                    if i==j:
342                        callargs.append(wrongarg)
343                    else:
344                        callargs.append(args[i])
345                self.assertRaises(TypeError, exctype, *callargs)
346
347        # check with the correct number and type of arguments
348        exc = exctype(*args)
349        self.assertEqual(str(exc), msg)
350
351    def test_unicodeencodeerror(self):
352        self.check_exceptionobjectargs(
353            UnicodeEncodeError,
354            ["ascii", u"g\xfcrk", 1, 2, "ouch"],
355            "'ascii' codec can't encode character u'\\xfc' in position 1: ouch"
356        )
357        self.check_exceptionobjectargs(
358            UnicodeEncodeError,
359            ["ascii", u"g\xfcrk", 1, 4, "ouch"],
360            "'ascii' codec can't encode characters in position 1-3: ouch"
361        )
362        self.check_exceptionobjectargs(
363            UnicodeEncodeError,
364            ["ascii", u"\xfcx", 0, 1, "ouch"],
365            "'ascii' codec can't encode character u'\\xfc' in position 0: ouch"
366        )
367        self.check_exceptionobjectargs(
368            UnicodeEncodeError,
369            ["ascii", u"\u0100x", 0, 1, "ouch"],
370            "'ascii' codec can't encode character u'\\u0100' in position 0: ouch"
371        )
372        self.check_exceptionobjectargs(
373            UnicodeEncodeError,
374            ["ascii", u"\uffffx", 0, 1, "ouch"],
375            "'ascii' codec can't encode character u'\\uffff' in position 0: ouch"
376        )
377        if sys.maxunicode > 0xffff:
378            self.check_exceptionobjectargs(
379                UnicodeEncodeError,
380                ["ascii", u"\U00010000x", 0, 1, "ouch"],
381                "'ascii' codec can't encode character u'\\U00010000' in position 0: ouch"
382            )
383
384    def test_unicodedecodeerror(self):
385        self.check_exceptionobjectargs(
386            UnicodeDecodeError,
387            ["ascii", "g\xfcrk", 1, 2, "ouch"],
388            "'ascii' codec can't decode byte 0xfc in position 1: ouch"
389        )
390        self.check_exceptionobjectargs(
391            UnicodeDecodeError,
392            ["ascii", "g\xfcrk", 1, 3, "ouch"],
393            "'ascii' codec can't decode bytes in position 1-2: ouch"
394        )
395
396    def test_unicodetranslateerror(self):
397        self.check_exceptionobjectargs(
398            UnicodeTranslateError,
399            [u"g\xfcrk", 1, 2, "ouch"],
400            "can't translate character u'\\xfc' in position 1: ouch"
401        )
402        self.check_exceptionobjectargs(
403            UnicodeTranslateError,
404            [u"g\u0100rk", 1, 2, "ouch"],
405            "can't translate character u'\\u0100' in position 1: ouch"
406        )
407        self.check_exceptionobjectargs(
408            UnicodeTranslateError,
409            [u"g\uffffrk", 1, 2, "ouch"],
410            "can't translate character u'\\uffff' in position 1: ouch"
411        )
412        if sys.maxunicode > 0xffff:
413            self.check_exceptionobjectargs(
414                UnicodeTranslateError,
415                [u"g\U00010000rk", 1, 2, "ouch"],
416                "can't translate character u'\\U00010000' in position 1: ouch"
417            )
418        self.check_exceptionobjectargs(
419            UnicodeTranslateError,
420            [u"g\xfcrk", 1, 3, "ouch"],
421            "can't translate characters in position 1-2: ouch"
422        )
423
424    def test_badandgoodstrictexceptions(self):
425        # "strict" complains about a non-exception passed in
426        self.assertRaises(
427            TypeError,
428            codecs.strict_errors,
429            42
430        )
431        # "strict" complains about the wrong exception type
432        self.assertRaises(
433            Exception,
434            codecs.strict_errors,
435            Exception("ouch")
436        )
437
438        # If the correct exception is passed in, "strict" raises it
439        self.assertRaises(
440            UnicodeEncodeError,
441            codecs.strict_errors,
442            UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")
443        )
444        self.assertRaises(
445            UnicodeDecodeError,
446            codecs.strict_errors,
447            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
448        )
449        self.assertRaises(
450            UnicodeTranslateError,
451            codecs.strict_errors,
452            UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
453        )
454
455    def test_badandgoodignoreexceptions(self):
456        # "ignore" complains about a non-exception passed in
457        self.assertRaises(
458           TypeError,
459           codecs.ignore_errors,
460           42
461        )
462        # "ignore" complains about the wrong exception type
463        self.assertRaises(
464           TypeError,
465           codecs.ignore_errors,
466           UnicodeError("ouch")
467        )
468        # If the correct exception is passed in, "ignore" returns an empty replacement
469        self.assertEqual(
470            codecs.ignore_errors(
471                UnicodeEncodeError("ascii", u"a\u3042b", 1, 2, "ouch")),
472            (u"", 2)
473        )
474        self.assertEqual(
475            codecs.ignore_errors(
476                UnicodeDecodeError("ascii", "a\xffb", 1, 2, "ouch")),
477            (u"", 2)
478        )
479        self.assertEqual(
480            codecs.ignore_errors(
481                UnicodeTranslateError(u"a\u3042b", 1, 2, "ouch")),
482            (u"", 2)
483        )
484
485    def test_badandgoodreplaceexceptions(self):
486        # "replace" complains about a non-exception passed in
487        self.assertRaises(
488           TypeError,
489           codecs.replace_errors,
490           42
491        )
492        # "replace" complains about the wrong exception type
493        self.assertRaises(
494           TypeError,
495           codecs.replace_errors,
496           UnicodeError("ouch")
497        )
498        self.assertRaises(
499            TypeError,
500            codecs.replace_errors,
501            BadObjectUnicodeEncodeError()
502        )
503        self.assertRaises(
504            TypeError,
505            codecs.replace_errors,
506            BadObjectUnicodeDecodeError()
507        )
508        # With the correct exception, "replace" returns an "?" or u"\ufffd" replacement
509        self.assertEqual(
510            codecs.replace_errors(
511                UnicodeEncodeError("ascii", u"a\u3042b", 1, 2, "ouch")),
512            (u"?", 2)
513        )
514        self.assertEqual(
515            codecs.replace_errors(
516                UnicodeDecodeError("ascii", "a\xffb", 1, 2, "ouch")),
517            (u"\ufffd", 2)
518        )
519        self.assertEqual(
520            codecs.replace_errors(
521                UnicodeTranslateError(u"a\u3042b", 1, 2, "ouch")),
522            (u"\ufffd", 2)
523        )
524
525    def test_badandgoodxmlcharrefreplaceexceptions(self):
526        # "xmlcharrefreplace" complains about a non-exception passed in
527        self.assertRaises(
528           TypeError,
529           codecs.xmlcharrefreplace_errors,
530           42
531        )
532        # "xmlcharrefreplace" complains about the wrong exception types
533        self.assertRaises(
534           TypeError,
535           codecs.xmlcharrefreplace_errors,
536           UnicodeError("ouch")
537        )
538        # "xmlcharrefreplace" can only be used for encoding
539        self.assertRaises(
540            TypeError,
541            codecs.xmlcharrefreplace_errors,
542            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
543        )
544        self.assertRaises(
545            TypeError,
546            codecs.xmlcharrefreplace_errors,
547            UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
548        )
549        # Use the correct exception
550        cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000)
551        cs += (0xdfff, 0xd800)
552        s = u"".join(unichr(c) for c in cs)
553        s += u"\U0001869f\U000186a0\U000f423f\U000f4240"
554        cs += (99999, 100000, 999999, 1000000)
555        self.assertEqual(
556            codecs.xmlcharrefreplace_errors(
557                UnicodeEncodeError("ascii", u"a" + s + u"b",
558                                   1, 1 + len(s), "ouch")
559            ),
560            (u"".join(u"&#%d;" % c for c in cs), 1 + len(s))
561        )
562
563    def test_badandgoodbackslashreplaceexceptions(self):
564        # "backslashreplace" complains about a non-exception passed in
565        self.assertRaises(
566           TypeError,
567           codecs.backslashreplace_errors,
568           42
569        )
570        # "backslashreplace" complains about the wrong exception types
571        self.assertRaises(
572           TypeError,
573           codecs.backslashreplace_errors,
574           UnicodeError("ouch")
575        )
576        # "backslashreplace" can only be used for encoding
577        self.assertRaises(
578            TypeError,
579            codecs.backslashreplace_errors,
580            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
581        )
582        self.assertRaises(
583            TypeError,
584            codecs.backslashreplace_errors,
585            UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
586        )
587        # Use the correct exception
588        tests = [
589            (u"\u3042", u"\\u3042"),
590            (u"\n", u"\\x0a"),
591            (u"a", u"\\x61"),
592            (u"\x00", u"\\x00"),
593            (u"\xff", u"\\xff"),
594            (u"\u0100", u"\\u0100"),
595            (u"\uffff", u"\\uffff"),
596            # Lone surrogates
597            (u"\ud800", u"\\ud800"),
598            (u"\udfff", u"\\udfff"),
599        ]
600        if sys.maxunicode > 0xffff:
601            tests += [
602                (u"\U00010000", u"\\U00010000"),
603                (u"\U0010ffff", u"\\U0010ffff"),
604            ]
605        else:
606            tests += [
607                (u"\U00010000", u"\\ud800\\udc00"),
608                (u"\U0010ffff", u"\\udbff\\udfff"),
609            ]
610        for s, r in tests:
611            self.assertEqual(
612                codecs.backslashreplace_errors(
613                    UnicodeEncodeError("ascii", u"a" + s + u"b",
614                                       1, 1 + len(s), "ouch")),
615                (r, 1 + len(s))
616            )
617
618    def test_badhandlerresults(self):
619        results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
620        encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
621
622        for res in results:
623            codecs.register_error("test.badhandler", lambda x: res)
624            for enc in encs:
625                self.assertRaises(
626                    TypeError,
627                    u"\u3042".encode,
628                    enc,
629                    "test.badhandler"
630                )
631            for (enc, bytes) in (
632                ("ascii", "\xff"),
633                ("utf-8", "\xff"),
634                ("utf-7", "+x-"),
635                ("unicode-internal", "\x00"),
636            ):
637                self.assertRaises(
638                    TypeError,
639                    bytes.decode,
640                    enc,
641                    "test.badhandler"
642                )
643
644    def test_lookup(self):
645        self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
646        self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore"))
647        self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
648        self.assertEqual(
649            codecs.xmlcharrefreplace_errors,
650            codecs.lookup_error("xmlcharrefreplace")
651        )
652        self.assertEqual(
653            codecs.backslashreplace_errors,
654            codecs.lookup_error("backslashreplace")
655        )
656
657    def test_unencodablereplacement(self):
658        def unencrepl(exc):
659            if isinstance(exc, UnicodeEncodeError):
660                return (u"\u4242", exc.end)
661            else:
662                raise TypeError("don't know how to handle %r" % exc)
663        codecs.register_error("test.unencreplhandler", unencrepl)
664        for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
665            self.assertRaises(
666                UnicodeEncodeError,
667                u"\u4242".encode,
668                enc,
669                "test.unencreplhandler"
670            )
671
672    def test_badregistercall(self):
673        # enhance coverage of:
674        # Modules/_codecsmodule.c::register_error()
675        # Python/codecs.c::PyCodec_RegisterError()
676        self.assertRaises(TypeError, codecs.register_error, 42)
677        self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)
678
679    def test_badlookupcall(self):
680        # enhance coverage of:
681        # Modules/_codecsmodule.c::lookup_error()
682        self.assertRaises(TypeError, codecs.lookup_error)
683
684    def test_unknownhandler(self):
685        # enhance coverage of:
686        # Modules/_codecsmodule.c::lookup_error()
687        self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")
688
689    def test_xmlcharrefvalues(self):
690        # enhance coverage of:
691        # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
692        # and inline implementations
693        v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000)
694        if sys.maxunicode>=100000:
695            v += (100000, 500000, 1000000)
696        s = u"".join([unichr(x) for x in v])
697        codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
698        for enc in ("ascii", "iso-8859-15"):
699            for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
700                s.encode(enc, err)
701
702    def test_decodehelper(self):
703        # enhance coverage of:
704        # Objects/unicodeobject.c::unicode_decode_call_errorhandler()
705        # and callers
706        self.assertRaises(LookupError, "\xff".decode, "ascii", "test.unknown")
707
708        def baddecodereturn1(exc):
709            return 42
710        codecs.register_error("test.baddecodereturn1", baddecodereturn1)
711        self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn1")
712        self.assertRaises(TypeError, "\\".decode, "unicode-escape", "test.baddecodereturn1")
713        self.assertRaises(TypeError, "\\x0".decode, "unicode-escape", "test.baddecodereturn1")
714        self.assertRaises(TypeError, "\\x0y".decode, "unicode-escape", "test.baddecodereturn1")
715        self.assertRaises(TypeError, "\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1")
716        self.assertRaises(TypeError, "\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1")
717
718        def baddecodereturn2(exc):
719            return (u"?", None)
720        codecs.register_error("test.baddecodereturn2", baddecodereturn2)
721        self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn2")
722
723        handler = PosReturn()
724        codecs.register_error("test.posreturn", handler.handle)
725
726        # Valid negative position
727        handler.pos = -1
728        self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?>0")
729
730        # Valid negative position
731        handler.pos = -2
732        self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?><?>")
733
734        # Negative position out of bounds
735        handler.pos = -3
736        self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn")
737
738        # Valid positive position
739        handler.pos = 1
740        self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?>0")
741
742        # Largest valid positive position (one beyond end of input)
743        handler.pos = 2
744        self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?>")
745
746        # Invalid positive position
747        handler.pos = 3
748        self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn")
749
750        # Restart at the "0"
751        handler.pos = 6
752        self.assertEqual("\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), u"<?>0")
753
754        class D(dict):
755            def __getitem__(self, key):
756                raise ValueError
757        self.assertRaises(UnicodeError, codecs.charmap_decode, "\xff", "strict", {0xff: None})
758        self.assertRaises(ValueError, codecs.charmap_decode, "\xff", "strict", D())
759        self.assertRaises(TypeError, codecs.charmap_decode, "\xff", "strict", {0xff: 0x110000})
760
761    def test_encodehelper(self):
762        # enhance coverage of:
763        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
764        # and callers
765        self.assertRaises(LookupError, u"\xff".encode, "ascii", "test.unknown")
766
767        def badencodereturn1(exc):
768            return 42
769        codecs.register_error("test.badencodereturn1", badencodereturn1)
770        self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn1")
771
772        def badencodereturn2(exc):
773            return (u"?", None)
774        codecs.register_error("test.badencodereturn2", badencodereturn2)
775        self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn2")
776
777        handler = PosReturn()
778        codecs.register_error("test.posreturn", handler.handle)
779
780        # Valid negative position
781        handler.pos = -1
782        self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")
783
784        # Valid negative position
785        handler.pos = -2
786        self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?><?>")
787
788        # Negative position out of bounds
789        handler.pos = -3
790        self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")
791
792        # Valid positive position
793        handler.pos = 1
794        self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")
795
796        # Largest valid positive position (one beyond end of input
797        handler.pos = 2
798        self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?>")
799
800        # Invalid positive position
801        handler.pos = 3
802        self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")
803
804        handler.pos = 0
805
806        class D(dict):
807            def __getitem__(self, key):
808                raise ValueError
809        for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"):
810            self.assertRaises(UnicodeError, codecs.charmap_encode, u"\xff", err, {0xff: None})
811            self.assertRaises(ValueError, codecs.charmap_encode, u"\xff", err, D())
812            self.assertRaises(TypeError, codecs.charmap_encode, u"\xff", err, {0xff: 300})
813
814    def test_translatehelper(self):
815        # enhance coverage of:
816        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
817        # and callers
818        # (Unfortunately the errors argument is not directly accessible
819        # from Python, so we can't test that much)
820        class D(dict):
821            def __getitem__(self, key):
822                raise ValueError
823        self.assertRaises(ValueError, u"\xff".translate, D())
824        self.assertRaises(TypeError, u"\xff".translate, {0xff: sys.maxunicode+1})
825        self.assertRaises(TypeError, u"\xff".translate, {0xff: ()})
826
827    def test_bug828737(self):
828        charmap = {
829            ord("&"): u"&amp;",
830            ord("<"): u"&lt;",
831            ord(">"): u"&gt;",
832            ord('"'): u"&quot;",
833        }
834
835        for n in (1, 10, 100, 1000):
836            text = u'abc<def>ghi'*n
837            text.translate(charmap)
838
839    def test_fake_error_class(self):
840        handlers = [
841            codecs.strict_errors,
842            codecs.ignore_errors,
843            codecs.replace_errors,
844            codecs.backslashreplace_errors,
845            codecs.xmlcharrefreplace_errors,
846        ]
847        for cls in UnicodeEncodeError, UnicodeDecodeError, UnicodeTranslateError:
848            class FakeUnicodeError(str):
849                __class__ = cls
850            for handler in handlers:
851                self.assertRaises(TypeError, handler, FakeUnicodeError())
852            class FakeUnicodeError(Exception):
853                __class__ = cls
854            for handler in handlers:
855                with self.assertRaises((TypeError, FakeUnicodeError)):
856                    handler(FakeUnicodeError())
857
858
def test_main():
    """Run CodecCallbackTest through test.test_support.run_unittest."""
    test.test_support.run_unittest(CodecCallbackTest)


if __name__ == "__main__":
    # Run the tests when the file is executed as a script.
    test_main()
864