• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import codecs
2import html.entities
3import itertools
4import sys
5import unicodedata
6import unittest
7
8
class PosReturn:
    # Configurable error callback: the caller sets ``pos`` and ``handle``
    # hands that position back to the codec together with "<?>".

    def __init__(self):
        self.pos = 0

    def handle(self, exc):
        requested = self.pos
        # A negative position counts from the end of the input object.
        if requested < 0:
            effective = len(exc.object) + requested
        else:
            effective = requested
        # If this call does not move past the start of the error, jump to
        # the end of the input on the next call; otherwise the codec would
        # keep calling the handler forever.
        if effective <= exc.start:
            self.pos = len(exc.object)
        return ("<?>", requested)
25
class RepeatedPosReturn:
    # Error callback that returns the preset position ``pos`` for the next
    # ``count`` calls and the end of the error range afterwards.

    def __init__(self, repl="<?>"):
        self.repl = repl
        self.pos = 0
        self.count = 0

    def handle(self, exc):
        # No repetitions left: continue after the offending range.
        if self.count <= 0:
            return (self.repl, exc.end)
        # Use up one repetition and resume at the configured position.
        self.count -= 1
        return (self.repl, self.pos)
37
# A UnicodeEncodeError whose start attribute is overwritten with a list
# right after regular initialization.
class BadStartUnicodeEncodeError(UnicodeEncodeError):
    def __init__(self):
        super().__init__("ascii", "", 0, 1, "bad")
        self.start = []
43
# A UnicodeEncodeError whose object attribute is overwritten with a list
# right after regular initialization.
class BadObjectUnicodeEncodeError(UnicodeEncodeError):
    def __init__(self):
        super().__init__("ascii", "", 0, 1, "bad")
        self.object = []
49
# A UnicodeDecodeError that tries to delete its end attribute right after
# regular initialization.
class NoEndUnicodeDecodeError(UnicodeDecodeError):
    def __init__(self):
        super().__init__("ascii", bytearray(b""), 0, 1, "bad")
        del self.end
55
# A UnicodeDecodeError whose object attribute is overwritten with a list
# right after regular initialization.
class BadObjectUnicodeDecodeError(UnicodeDecodeError):
    def __init__(self):
        super().__init__("ascii", bytearray(b""), 0, 1, "bad")
        self.object = []
61
# A UnicodeTranslateError that tries to delete its start attribute right
# after regular initialization.
class NoStartUnicodeTranslateError(UnicodeTranslateError):
    def __init__(self):
        super().__init__("", 0, 1, "bad")
        del self.start
67
# A UnicodeTranslateError that tries to delete its end attribute right
# after regular initialization.
class NoEndUnicodeTranslateError(UnicodeTranslateError):
    def __init__(self):
        super().__init__("", 0, 1, "bad")
        del self.end
73
# A UnicodeTranslateError that tries to delete its object attribute right
# after regular initialization.
class NoObjectUnicodeTranslateError(UnicodeTranslateError):
    def __init__(self):
        super().__init__("", 0, 1, "bad")
        del self.object
79
80class CodecCallbackTest(unittest.TestCase):
81
82    def test_xmlcharrefreplace(self):
83        # replace unencodable characters which numeric character entities.
84        # For ascii, latin-1 and charmaps this is completely implemented
85        # in C and should be reasonably fast.
86        s = "\u30b9\u30d1\u30e2 \xe4nd eggs"
87        self.assertEqual(
88            s.encode("ascii", "xmlcharrefreplace"),
89            b"&#12473;&#12497;&#12514; &#228;nd eggs"
90        )
91        self.assertEqual(
92            s.encode("latin-1", "xmlcharrefreplace"),
93            b"&#12473;&#12497;&#12514; \xe4nd eggs"
94        )
95
96    def test_xmlcharnamereplace(self):
97        # This time use a named character entity for unencodable
98        # characters, if one is available.
99
100        def xmlcharnamereplace(exc):
101            if not isinstance(exc, UnicodeEncodeError):
102                raise TypeError("don't know how to handle %r" % exc)
103            l = []
104            for c in exc.object[exc.start:exc.end]:
105                try:
106                    l.append("&%s;" % html.entities.codepoint2name[ord(c)])
107                except KeyError:
108                    l.append("&#%d;" % ord(c))
109            return ("".join(l), exc.end)
110
111        codecs.register_error(
112            "test.xmlcharnamereplace", xmlcharnamereplace)
113
114        sin = "\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
115        sout = b"&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
116        self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
117        sout = b"\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
118        self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
119        sout = b"\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
120        self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)
121
122    def test_uninamereplace(self):
123        # We're using the names from the unicode database this time,
124        # and we're doing "syntax highlighting" here, i.e. we include
125        # the replaced text in ANSI escape sequences. For this it is
126        # useful that the error handler is not called for every single
127        # unencodable character, but for a complete sequence of
128        # unencodable characters, otherwise we would output many
129        # unnecessary escape sequences.
130
131        def uninamereplace(exc):
132            if not isinstance(exc, UnicodeEncodeError):
133                raise TypeError("don't know how to handle %r" % exc)
134            l = []
135            for c in exc.object[exc.start:exc.end]:
136                l.append(unicodedata.name(c, "0x%x" % ord(c)))
137            return ("\033[1m%s\033[0m" % ", ".join(l), exc.end)
138
139        codecs.register_error(
140            "test.uninamereplace", uninamereplace)
141
142        sin = "\xac\u1234\u20ac\u8000"
143        sout = b"\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
144        self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)
145
146        sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
147        self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)
148
149        sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
150        self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)
151
152    def test_backslashescape(self):
153        # Does the same as the "unicode-escape" encoding, but with different
154        # base encodings.
155        sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
156        sout = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
157        self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)
158
159        sout = b"a\xac\\u1234\\u20ac\\u8000\\U0010ffff"
160        self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)
161
162        sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff"
163        self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
164
165    def test_nameescape(self):
166        # Does the same as backslashescape, but prefers ``\N{...}`` escape
167        # sequences.
168        sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
169        sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
170                b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
171        self.assertEqual(sin.encode("ascii", "namereplace"), sout)
172
173        sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
174                b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
175        self.assertEqual(sin.encode("latin-1", "namereplace"), sout)
176
177        sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4'
178                b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
179        self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout)
180
181    def test_decoding_callbacks(self):
182        # This is a test for a decoding callback handler
183        # that allows the decoding of the invalid sequence
184        # "\xc0\x80" and returns "\x00" instead of raising an error.
185        # All other illegal sequences will be handled strictly.
186        def relaxedutf8(exc):
187            if not isinstance(exc, UnicodeDecodeError):
188                raise TypeError("don't know how to handle %r" % exc)
189            if exc.object[exc.start:exc.start+2] == b"\xc0\x80":
190                return ("\x00", exc.start+2) # retry after two bytes
191            else:
192                raise exc
193
194        codecs.register_error("test.relaxedutf8", relaxedutf8)
195
196        # all the "\xc0\x80" will be decoded to "\x00"
197        sin = b"a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
198        sout = "a\x00b\x00c\xfc\x00\x00"
199        self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
200
201        # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised
202        sin = b"\xc0\x80\xc0\x81"
203        self.assertRaises(UnicodeDecodeError, sin.decode,
204                          "utf-8", "test.relaxedutf8")
205
206    def test_charmapencode(self):
207        # For charmap encodings the replacement string will be
208        # mapped through the encoding again. This means, that
209        # to be able to use e.g. the "replace" handler, the
210        # charmap has to have a mapping for "?".
211        charmap = dict((ord(c), bytes(2*c.upper(), 'ascii')) for c in "abcdefgh")
212        sin = "abc"
213        sout = b"AABBCC"
214        self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
215
216        sin = "abcA"
217        self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
218
219        charmap[ord("?")] = b"XYZ"
220        sin = "abcDEF"
221        sout = b"AABBCCXYZXYZXYZ"
222        self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
223
224        charmap[ord("?")] = "XYZ" # wrong type in mapping
225        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
226
227    def test_callbacks(self):
228        def handler1(exc):
229            r = range(exc.start, exc.end)
230            if isinstance(exc, UnicodeEncodeError):
231                l = ["<%d>" % ord(exc.object[pos]) for pos in r]
232            elif isinstance(exc, UnicodeDecodeError):
233                l = ["<%d>" % exc.object[pos] for pos in r]
234            else:
235                raise TypeError("don't know how to handle %r" % exc)
236            return ("[%s]" % "".join(l), exc.end)
237
238        codecs.register_error("test.handler1", handler1)
239
240        def handler2(exc):
241            if not isinstance(exc, UnicodeDecodeError):
242                raise TypeError("don't know how to handle %r" % exc)
243            l = ["<%d>" % exc.object[pos] for pos in range(exc.start, exc.end)]
244            return ("[%s]" % "".join(l), exc.end+1) # skip one character
245
246        codecs.register_error("test.handler2", handler2)
247
248        s = b"\x00\x81\x7f\x80\xff"
249
250        self.assertEqual(
251            s.decode("ascii", "test.handler1"),
252            "\x00[<129>]\x7f[<128>][<255>]"
253        )
254        self.assertEqual(
255            s.decode("ascii", "test.handler2"),
256            "\x00[<129>][<128>]"
257        )
258
259        self.assertEqual(
260            b"\\u3042\\u3xxx".decode("unicode-escape", "test.handler1"),
261            "\u3042[<92><117><51>]xxx"
262        )
263
264        self.assertEqual(
265            b"\\u3042\\u3xx".decode("unicode-escape", "test.handler1"),
266            "\u3042[<92><117><51>]xx"
267        )
268
269        self.assertEqual(
270            codecs.charmap_decode(b"abc", "test.handler1", {ord("a"): "z"})[0],
271            "z[<98>][<99>]"
272        )
273
274        self.assertEqual(
275            "g\xfc\xdfrk".encode("ascii", "test.handler1"),
276            b"g[<252><223>]rk"
277        )
278
279        self.assertEqual(
280            "g\xfc\xdf".encode("ascii", "test.handler1"),
281            b"g[<252><223>]"
282        )
283
284    def test_longstrings(self):
285        # test long strings to check for memory overflow problems
286        errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
287                   "backslashreplace", "namereplace"]
288        # register the handlers under different names,
289        # to prevent the codec from recognizing the name
290        for err in errors:
291            codecs.register_error("test." + err, codecs.lookup_error(err))
292        l = 1000
293        errors += [ "test." + err for err in errors ]
294        for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]:
295            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
296                        "utf-8", "utf-7", "utf-16", "utf-32"):
297                for err in errors:
298                    try:
299                        uni.encode(enc, err)
300                    except UnicodeError:
301                        pass
302
303    def check_exceptionobjectargs(self, exctype, args, msg):
304        # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
305        # check with one missing argument
306        self.assertRaises(TypeError, exctype, *args[:-1])
307        # check with one argument too much
308        self.assertRaises(TypeError, exctype, *(args + ["too much"]))
309        # check with one argument of the wrong type
310        wrongargs = [ "spam", b"eggs", b"spam", 42, 1.0, None ]
311        for i in range(len(args)):
312            for wrongarg in wrongargs:
313                if type(wrongarg) is type(args[i]):
314                    continue
315                # build argument array
316                callargs = []
317                for j in range(len(args)):
318                    if i==j:
319                        callargs.append(wrongarg)
320                    else:
321                        callargs.append(args[i])
322                self.assertRaises(TypeError, exctype, *callargs)
323
324        # check with the correct number and type of arguments
325        exc = exctype(*args)
326        self.assertEqual(str(exc), msg)
327
328    def test_unicodeencodeerror(self):
329        self.check_exceptionobjectargs(
330            UnicodeEncodeError,
331            ["ascii", "g\xfcrk", 1, 2, "ouch"],
332            "'ascii' codec can't encode character '\\xfc' in position 1: ouch"
333        )
334        self.check_exceptionobjectargs(
335            UnicodeEncodeError,
336            ["ascii", "g\xfcrk", 1, 4, "ouch"],
337            "'ascii' codec can't encode characters in position 1-3: ouch"
338        )
339        self.check_exceptionobjectargs(
340            UnicodeEncodeError,
341            ["ascii", "\xfcx", 0, 1, "ouch"],
342            "'ascii' codec can't encode character '\\xfc' in position 0: ouch"
343        )
344        self.check_exceptionobjectargs(
345            UnicodeEncodeError,
346            ["ascii", "\u0100x", 0, 1, "ouch"],
347            "'ascii' codec can't encode character '\\u0100' in position 0: ouch"
348        )
349        self.check_exceptionobjectargs(
350            UnicodeEncodeError,
351            ["ascii", "\uffffx", 0, 1, "ouch"],
352            "'ascii' codec can't encode character '\\uffff' in position 0: ouch"
353        )
354        self.check_exceptionobjectargs(
355            UnicodeEncodeError,
356            ["ascii", "\U00010000x", 0, 1, "ouch"],
357            "'ascii' codec can't encode character '\\U00010000' in position 0: ouch"
358        )
359
360    def test_unicodedecodeerror(self):
361        self.check_exceptionobjectargs(
362            UnicodeDecodeError,
363            ["ascii", bytearray(b"g\xfcrk"), 1, 2, "ouch"],
364            "'ascii' codec can't decode byte 0xfc in position 1: ouch"
365        )
366        self.check_exceptionobjectargs(
367            UnicodeDecodeError,
368            ["ascii", bytearray(b"g\xfcrk"), 1, 3, "ouch"],
369            "'ascii' codec can't decode bytes in position 1-2: ouch"
370        )
371
372    def test_unicodetranslateerror(self):
373        self.check_exceptionobjectargs(
374            UnicodeTranslateError,
375            ["g\xfcrk", 1, 2, "ouch"],
376            "can't translate character '\\xfc' in position 1: ouch"
377        )
378        self.check_exceptionobjectargs(
379            UnicodeTranslateError,
380            ["g\u0100rk", 1, 2, "ouch"],
381            "can't translate character '\\u0100' in position 1: ouch"
382        )
383        self.check_exceptionobjectargs(
384            UnicodeTranslateError,
385            ["g\uffffrk", 1, 2, "ouch"],
386            "can't translate character '\\uffff' in position 1: ouch"
387        )
388        self.check_exceptionobjectargs(
389            UnicodeTranslateError,
390            ["g\U00010000rk", 1, 2, "ouch"],
391            "can't translate character '\\U00010000' in position 1: ouch"
392        )
393        self.check_exceptionobjectargs(
394            UnicodeTranslateError,
395            ["g\xfcrk", 1, 3, "ouch"],
396            "can't translate characters in position 1-2: ouch"
397        )
398
399    def test_badandgoodstrictexceptions(self):
400        # "strict" complains about a non-exception passed in
401        self.assertRaises(
402            TypeError,
403            codecs.strict_errors,
404            42
405        )
406        # "strict" complains about the wrong exception type
407        self.assertRaises(
408            Exception,
409            codecs.strict_errors,
410            Exception("ouch")
411        )
412
413        # If the correct exception is passed in, "strict" raises it
414        self.assertRaises(
415            UnicodeEncodeError,
416            codecs.strict_errors,
417            UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")
418        )
419        self.assertRaises(
420            UnicodeDecodeError,
421            codecs.strict_errors,
422            UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
423        )
424        self.assertRaises(
425            UnicodeTranslateError,
426            codecs.strict_errors,
427            UnicodeTranslateError("\u3042", 0, 1, "ouch")
428        )
429
430    def test_badandgoodignoreexceptions(self):
431        # "ignore" complains about a non-exception passed in
432        self.assertRaises(
433           TypeError,
434           codecs.ignore_errors,
435           42
436        )
437        # "ignore" complains about the wrong exception type
438        self.assertRaises(
439           TypeError,
440           codecs.ignore_errors,
441           UnicodeError("ouch")
442        )
443        # If the correct exception is passed in, "ignore" returns an empty replacement
444        self.assertEqual(
445            codecs.ignore_errors(
446                UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")),
447            ("", 2)
448        )
449        self.assertEqual(
450            codecs.ignore_errors(
451                UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")),
452            ("", 2)
453        )
454        self.assertEqual(
455            codecs.ignore_errors(
456                UnicodeTranslateError("a\u3042b", 1, 2, "ouch")),
457            ("", 2)
458        )
459
460    def test_badandgoodreplaceexceptions(self):
461        # "replace" complains about a non-exception passed in
462        self.assertRaises(
463           TypeError,
464           codecs.replace_errors,
465           42
466        )
467        # "replace" complains about the wrong exception type
468        self.assertRaises(
469           TypeError,
470           codecs.replace_errors,
471           UnicodeError("ouch")
472        )
473        self.assertRaises(
474            TypeError,
475            codecs.replace_errors,
476            BadObjectUnicodeEncodeError()
477        )
478        self.assertRaises(
479            TypeError,
480            codecs.replace_errors,
481            BadObjectUnicodeDecodeError()
482        )
483        # With the correct exception, "replace" returns an "?" or "\ufffd" replacement
484        self.assertEqual(
485            codecs.replace_errors(
486                UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")),
487            ("?", 2)
488        )
489        self.assertEqual(
490            codecs.replace_errors(
491                UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")),
492            ("\ufffd", 2)
493        )
494        self.assertEqual(
495            codecs.replace_errors(
496                UnicodeTranslateError("a\u3042b", 1, 2, "ouch")),
497            ("\ufffd", 2)
498        )
499
500    def test_badandgoodxmlcharrefreplaceexceptions(self):
501        # "xmlcharrefreplace" complains about a non-exception passed in
502        self.assertRaises(
503           TypeError,
504           codecs.xmlcharrefreplace_errors,
505           42
506        )
507        # "xmlcharrefreplace" complains about the wrong exception types
508        self.assertRaises(
509           TypeError,
510           codecs.xmlcharrefreplace_errors,
511           UnicodeError("ouch")
512        )
513        # "xmlcharrefreplace" can only be used for encoding
514        self.assertRaises(
515            TypeError,
516            codecs.xmlcharrefreplace_errors,
517            UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
518        )
519        self.assertRaises(
520            TypeError,
521            codecs.xmlcharrefreplace_errors,
522            UnicodeTranslateError("\u3042", 0, 1, "ouch")
523        )
524        # Use the correct exception
525        cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 99999, 100000,
526              999999, 1000000)
527        cs += (0xd800, 0xdfff)
528        s = "".join(chr(c) for c in cs)
529        self.assertEqual(
530            codecs.xmlcharrefreplace_errors(
531                UnicodeEncodeError("ascii", "a" + s + "b",
532                                   1, 1 + len(s), "ouch")
533            ),
534            ("".join("&#%d;" % c for c in cs), 1 + len(s))
535        )
536
537    def test_badandgoodbackslashreplaceexceptions(self):
538        # "backslashreplace" complains about a non-exception passed in
539        self.assertRaises(
540           TypeError,
541           codecs.backslashreplace_errors,
542           42
543        )
544        # "backslashreplace" complains about the wrong exception types
545        self.assertRaises(
546           TypeError,
547           codecs.backslashreplace_errors,
548           UnicodeError("ouch")
549        )
550        # Use the correct exception
551        tests = [
552            ("\u3042", "\\u3042"),
553            ("\n", "\\x0a"),
554            ("a", "\\x61"),
555            ("\x00", "\\x00"),
556            ("\xff", "\\xff"),
557            ("\u0100", "\\u0100"),
558            ("\uffff", "\\uffff"),
559            ("\U00010000", "\\U00010000"),
560            ("\U0010ffff", "\\U0010ffff"),
561            # Lone surrogates
562            ("\ud800", "\\ud800"),
563            ("\udfff", "\\udfff"),
564            ("\ud800\udfff", "\\ud800\\udfff"),
565        ]
566        for s, r in tests:
567            with self.subTest(str=s):
568                self.assertEqual(
569                    codecs.backslashreplace_errors(
570                        UnicodeEncodeError("ascii", "a" + s + "b",
571                                           1, 1 + len(s), "ouch")),
572                    (r, 1 + len(s))
573                )
574                self.assertEqual(
575                    codecs.backslashreplace_errors(
576                        UnicodeTranslateError("a" + s + "b",
577                                              1, 1 + len(s), "ouch")),
578                    (r, 1 + len(s))
579                )
580        tests = [
581            (b"a", "\\x61"),
582            (b"\n", "\\x0a"),
583            (b"\x00", "\\x00"),
584            (b"\xff", "\\xff"),
585        ]
586        for b, r in tests:
587            with self.subTest(bytes=b):
588                self.assertEqual(
589                    codecs.backslashreplace_errors(
590                        UnicodeDecodeError("ascii", bytearray(b"a" + b + b"b"),
591                                           1, 2, "ouch")),
592                    (r, 2)
593                )
594
595    def test_badandgoodnamereplaceexceptions(self):
596        # "namereplace" complains about a non-exception passed in
597        self.assertRaises(
598           TypeError,
599           codecs.namereplace_errors,
600           42
601        )
602        # "namereplace" complains about the wrong exception types
603        self.assertRaises(
604           TypeError,
605           codecs.namereplace_errors,
606           UnicodeError("ouch")
607        )
608        # "namereplace" can only be used for encoding
609        self.assertRaises(
610            TypeError,
611            codecs.namereplace_errors,
612            UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
613        )
614        self.assertRaises(
615            TypeError,
616            codecs.namereplace_errors,
617            UnicodeTranslateError("\u3042", 0, 1, "ouch")
618        )
619        # Use the correct exception
620        tests = [
621            ("\u3042", "\\N{HIRAGANA LETTER A}"),
622            ("\x00", "\\x00"),
623            ("\ufbf9", "\\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH "
624                       "HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}"),
625            ("\U000e007f", "\\N{CANCEL TAG}"),
626            ("\U0010ffff", "\\U0010ffff"),
627            # Lone surrogates
628            ("\ud800", "\\ud800"),
629            ("\udfff", "\\udfff"),
630            ("\ud800\udfff", "\\ud800\\udfff"),
631        ]
632        for s, r in tests:
633            with self.subTest(str=s):
634                self.assertEqual(
635                    codecs.namereplace_errors(
636                        UnicodeEncodeError("ascii", "a" + s + "b",
637                                           1, 1 + len(s), "ouch")),
638                    (r, 1 + len(s))
639                )
640
641    def test_badandgoodsurrogateescapeexceptions(self):
642        surrogateescape_errors = codecs.lookup_error('surrogateescape')
643        # "surrogateescape" complains about a non-exception passed in
644        self.assertRaises(
645           TypeError,
646           surrogateescape_errors,
647           42
648        )
649        # "surrogateescape" complains about the wrong exception types
650        self.assertRaises(
651           TypeError,
652           surrogateescape_errors,
653           UnicodeError("ouch")
654        )
655        # "surrogateescape" can not be used for translating
656        self.assertRaises(
657            TypeError,
658            surrogateescape_errors,
659            UnicodeTranslateError("\udc80", 0, 1, "ouch")
660        )
661        # Use the correct exception
662        for s in ("a", "\udc7f", "\udd00"):
663            with self.subTest(str=s):
664                self.assertRaises(
665                    UnicodeEncodeError,
666                    surrogateescape_errors,
667                    UnicodeEncodeError("ascii", s, 0, 1, "ouch")
668                )
669        self.assertEqual(
670            surrogateescape_errors(
671                UnicodeEncodeError("ascii", "a\udc80b", 1, 2, "ouch")),
672            (b"\x80", 2)
673        )
674        self.assertRaises(
675            UnicodeDecodeError,
676            surrogateescape_errors,
677            UnicodeDecodeError("ascii", bytearray(b"a"), 0, 1, "ouch")
678        )
679        self.assertEqual(
680            surrogateescape_errors(
681                UnicodeDecodeError("ascii", bytearray(b"a\x80b"), 1, 2, "ouch")),
682            ("\udc80", 2)
683        )
684
685    def test_badandgoodsurrogatepassexceptions(self):
686        surrogatepass_errors = codecs.lookup_error('surrogatepass')
687        # "surrogatepass" complains about a non-exception passed in
688        self.assertRaises(
689           TypeError,
690           surrogatepass_errors,
691           42
692        )
693        # "surrogatepass" complains about the wrong exception types
694        self.assertRaises(
695           TypeError,
696           surrogatepass_errors,
697           UnicodeError("ouch")
698        )
699        # "surrogatepass" can not be used for translating
700        self.assertRaises(
701            TypeError,
702            surrogatepass_errors,
703            UnicodeTranslateError("\ud800", 0, 1, "ouch")
704        )
705        # Use the correct exception
706        for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"):
707            with self.subTest(encoding=enc):
708                self.assertRaises(
709                    UnicodeEncodeError,
710                    surrogatepass_errors,
711                    UnicodeEncodeError(enc, "a", 0, 1, "ouch")
712                )
713                self.assertRaises(
714                    UnicodeDecodeError,
715                    surrogatepass_errors,
716                    UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch")
717                )
718        for s in ("\ud800", "\udfff", "\ud800\udfff"):
719            with self.subTest(str=s):
720                self.assertRaises(
721                    UnicodeEncodeError,
722                    surrogatepass_errors,
723                    UnicodeEncodeError("ascii", s, 0, len(s), "ouch")
724                )
725        tests = [
726            ("utf-8", "\ud800", b'\xed\xa0\x80', 3),
727            ("utf-16le", "\ud800", b'\x00\xd8', 2),
728            ("utf-16be", "\ud800", b'\xd8\x00', 2),
729            ("utf-32le", "\ud800", b'\x00\xd8\x00\x00', 4),
730            ("utf-32be", "\ud800", b'\x00\x00\xd8\x00', 4),
731            ("utf-8", "\udfff", b'\xed\xbf\xbf', 3),
732            ("utf-16le", "\udfff", b'\xff\xdf', 2),
733            ("utf-16be", "\udfff", b'\xdf\xff', 2),
734            ("utf-32le", "\udfff", b'\xff\xdf\x00\x00', 4),
735            ("utf-32be", "\udfff", b'\x00\x00\xdf\xff', 4),
736            ("utf-8", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3),
737            ("utf-16le", "\ud800\udfff", b'\x00\xd8\xff\xdf', 2),
738            ("utf-16be", "\ud800\udfff", b'\xd8\x00\xdf\xff', 2),
739            ("utf-32le", "\ud800\udfff", b'\x00\xd8\x00\x00\xff\xdf\x00\x00', 4),
740            ("utf-32be", "\ud800\udfff", b'\x00\x00\xd8\x00\x00\x00\xdf\xff', 4),
741        ]
742        for enc, s, b, n in tests:
743            with self.subTest(encoding=enc, str=s, bytes=b):
744                self.assertEqual(
745                    surrogatepass_errors(
746                        UnicodeEncodeError(enc, "a" + s + "b",
747                                           1, 1 + len(s), "ouch")),
748                    (b, 1 + len(s))
749                )
750                self.assertEqual(
751                    surrogatepass_errors(
752                        UnicodeDecodeError(enc, bytearray(b"a" + b[:n] + b"b"),
753                                           1, 1 + n, "ouch")),
754                    (s[:1], 1 + n)
755                )
756
757    def test_badhandlerresults(self):
758        results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
759        encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
760
761        for res in results:
762            codecs.register_error("test.badhandler", lambda x: res)
763            for enc in encs:
764                self.assertRaises(
765                    TypeError,
766                    "\u3042".encode,
767                    enc,
768                    "test.badhandler"
769                )
770            for (enc, bytes) in (
771                ("ascii", b"\xff"),
772                ("utf-8", b"\xff"),
773                ("utf-7", b"+x-"),
774            ):
775                self.assertRaises(
776                    TypeError,
777                    bytes.decode,
778                    enc,
779                    "test.badhandler"
780                )
781
782    def test_lookup(self):
783        self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
784        self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore"))
785        self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
786        self.assertEqual(
787            codecs.xmlcharrefreplace_errors,
788            codecs.lookup_error("xmlcharrefreplace")
789        )
790        self.assertEqual(
791            codecs.backslashreplace_errors,
792            codecs.lookup_error("backslashreplace")
793        )
794        self.assertEqual(
795            codecs.namereplace_errors,
796            codecs.lookup_error("namereplace")
797        )
798
799    def test_encode_nonascii_replacement(self):
800        def handle(exc):
801            if isinstance(exc, UnicodeEncodeError):
802                return (repl, exc.end)
803            raise TypeError("don't know how to handle %r" % exc)
804        codecs.register_error("test.replacing", handle)
805
806        for enc, input, repl in (
807                ("ascii", "[¤]", "abc"),
808                ("iso-8859-1", "[€]", "½¾"),
809                ("iso-8859-15", "[¤]", "œŸ"),
810        ):
811            res = input.encode(enc, "test.replacing")
812            self.assertEqual(res, ("[" + repl + "]").encode(enc))
813
814        for enc, input, repl in (
815                ("utf-8", "[\udc80]", "\U0001f40d"),
816                ("utf-16", "[\udc80]", "\U0001f40d"),
817                ("utf-32", "[\udc80]", "\U0001f40d"),
818        ):
819            with self.subTest(encoding=enc):
820                with self.assertRaises(UnicodeEncodeError) as cm:
821                    input.encode(enc, "test.replacing")
822                exc = cm.exception
823                self.assertEqual(exc.start, 1)
824                self.assertEqual(exc.end, 2)
825                self.assertEqual(exc.object, input)
826
827    def test_encode_unencodable_replacement(self):
828        def unencrepl(exc):
829            if isinstance(exc, UnicodeEncodeError):
830                return (repl, exc.end)
831            else:
832                raise TypeError("don't know how to handle %r" % exc)
833        codecs.register_error("test.unencreplhandler", unencrepl)
834
835        for enc, input, repl in (
836                ("ascii", "[¤]", "½"),
837                ("iso-8859-1", "[€]", "œ"),
838                ("iso-8859-15", "[¤]", "½"),
839                ("utf-8", "[\udc80]", "\udcff"),
840                ("utf-16", "[\udc80]", "\udcff"),
841                ("utf-32", "[\udc80]", "\udcff"),
842        ):
843            with self.subTest(encoding=enc):
844                with self.assertRaises(UnicodeEncodeError) as cm:
845                    input.encode(enc, "test.unencreplhandler")
846                exc = cm.exception
847                self.assertEqual(exc.start, 1)
848                self.assertEqual(exc.end, 2)
849                self.assertEqual(exc.object, input)
850
851    def test_encode_bytes_replacement(self):
852        def handle(exc):
853            if isinstance(exc, UnicodeEncodeError):
854                return (repl, exc.end)
855            raise TypeError("don't know how to handle %r" % exc)
856        codecs.register_error("test.replacing", handle)
857
858        # It works even if the bytes sequence is not decodable.
859        for enc, input, repl in (
860                ("ascii", "[¤]", b"\xbd\xbe"),
861                ("iso-8859-1", "[€]", b"\xbd\xbe"),
862                ("iso-8859-15", "[¤]", b"\xbd\xbe"),
863                ("utf-8", "[\udc80]", b"\xbd\xbe"),
864                ("utf-16le", "[\udc80]", b"\xbd\xbe"),
865                ("utf-16be", "[\udc80]", b"\xbd\xbe"),
866                ("utf-32le", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
867                ("utf-32be", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
868        ):
869            with self.subTest(encoding=enc):
870                res = input.encode(enc, "test.replacing")
871                self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc))
872
873    def test_encode_odd_bytes_replacement(self):
874        def handle(exc):
875            if isinstance(exc, UnicodeEncodeError):
876                return (repl, exc.end)
877            raise TypeError("don't know how to handle %r" % exc)
878        codecs.register_error("test.replacing", handle)
879
880        input = "[\udc80]"
881        # Tests in which the replacement bytestring contains not whole number
882        # of code units.
883        for enc, repl in (
884            *itertools.product(("utf-16le", "utf-16be"),
885                               [b"a", b"abc"]),
886            *itertools.product(("utf-32le", "utf-32be"),
887                               [b"a", b"ab", b"abc", b"abcde"]),
888        ):
889            with self.subTest(encoding=enc, repl=repl):
890                with self.assertRaises(UnicodeEncodeError) as cm:
891                    input.encode(enc, "test.replacing")
892                exc = cm.exception
893                self.assertEqual(exc.start, 1)
894                self.assertEqual(exc.end, 2)
895                self.assertEqual(exc.object, input)
896                self.assertEqual(exc.reason, "surrogates not allowed")
897
898    def test_badregistercall(self):
899        # enhance coverage of:
900        # Modules/_codecsmodule.c::register_error()
901        # Python/codecs.c::PyCodec_RegisterError()
902        self.assertRaises(TypeError, codecs.register_error, 42)
903        self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)
904
905    def test_badlookupcall(self):
906        # enhance coverage of:
907        # Modules/_codecsmodule.c::lookup_error()
908        self.assertRaises(TypeError, codecs.lookup_error)
909
910    def test_unknownhandler(self):
911        # enhance coverage of:
912        # Modules/_codecsmodule.c::lookup_error()
913        self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")
914
915    def test_xmlcharrefvalues(self):
916        # enhance coverage of:
917        # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
918        # and inline implementations
919        v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000,
920             500000, 1000000)
921        s = "".join([chr(x) for x in v])
922        codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
923        for enc in ("ascii", "iso-8859-15"):
924            for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
925                s.encode(enc, err)
926
927    def test_decodehelper(self):
928        # enhance coverage of:
929        # Objects/unicodeobject.c::unicode_decode_call_errorhandler()
930        # and callers
931        self.assertRaises(LookupError, b"\xff".decode, "ascii", "test.unknown")
932
933        def baddecodereturn1(exc):
934            return 42
935        codecs.register_error("test.baddecodereturn1", baddecodereturn1)
936        self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn1")
937        self.assertRaises(TypeError, b"\\".decode, "unicode-escape", "test.baddecodereturn1")
938        self.assertRaises(TypeError, b"\\x0".decode, "unicode-escape", "test.baddecodereturn1")
939        self.assertRaises(TypeError, b"\\x0y".decode, "unicode-escape", "test.baddecodereturn1")
940        self.assertRaises(TypeError, b"\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1")
941        self.assertRaises(TypeError, b"\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1")
942
943        def baddecodereturn2(exc):
944            return ("?", None)
945        codecs.register_error("test.baddecodereturn2", baddecodereturn2)
946        self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn2")
947
948        handler = PosReturn()
949        codecs.register_error("test.posreturn", handler.handle)
950
951        # Valid negative position
952        handler.pos = -1
953        self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>0")
954
955        # Valid negative position
956        handler.pos = -2
957        self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?><?>")
958
959        # Negative position out of bounds
960        handler.pos = -3
961        self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn")
962
963        # Valid positive position
964        handler.pos = 1
965        self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>0")
966
967        # Largest valid positive position (one beyond end of input)
968        handler.pos = 2
969        self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>")
970
971        # Invalid positive position
972        handler.pos = 3
973        self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn")
974
975        # Restart at the "0"
976        handler.pos = 6
977        self.assertEqual(b"\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), "<?>0")
978
979        class D(dict):
980            def __getitem__(self, key):
981                raise ValueError
982        self.assertRaises(UnicodeError, codecs.charmap_decode, b"\xff", "strict", {0xff: None})
983        self.assertRaises(ValueError, codecs.charmap_decode, b"\xff", "strict", D())
984        self.assertRaises(TypeError, codecs.charmap_decode, b"\xff", "strict", {0xff: sys.maxunicode+1})
985
986    def test_encodehelper(self):
987        # enhance coverage of:
988        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
989        # and callers
990        self.assertRaises(LookupError, "\xff".encode, "ascii", "test.unknown")
991
992        def badencodereturn1(exc):
993            return 42
994        codecs.register_error("test.badencodereturn1", badencodereturn1)
995        self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn1")
996
997        def badencodereturn2(exc):
998            return ("?", None)
999        codecs.register_error("test.badencodereturn2", badencodereturn2)
1000        self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn2")
1001
1002        handler = PosReturn()
1003        codecs.register_error("test.posreturn", handler.handle)
1004
1005        # Valid negative position
1006        handler.pos = -1
1007        self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>0")
1008
1009        # Valid negative position
1010        handler.pos = -2
1011        self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?><?>")
1012
1013        # Negative position out of bounds
1014        handler.pos = -3
1015        self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn")
1016
1017        # Valid positive position
1018        handler.pos = 1
1019        self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>0")
1020
1021        # Largest valid positive position (one beyond end of input
1022        handler.pos = 2
1023        self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>")
1024
1025        # Invalid positive position
1026        handler.pos = 3
1027        self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn")
1028
1029        handler.pos = 0
1030
1031        class D(dict):
1032            def __getitem__(self, key):
1033                raise ValueError
1034        for err in ("strict", "replace", "xmlcharrefreplace",
1035                    "backslashreplace", "namereplace", "test.posreturn"):
1036            self.assertRaises(UnicodeError, codecs.charmap_encode, "\xff", err, {0xff: None})
1037            self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D())
1038            self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300})
1039
1040    def test_decodehelper_bug36819(self):
1041        handler = RepeatedPosReturn("x")
1042        codecs.register_error("test.bug36819", handler.handle)
1043
1044        testcases = [
1045            ("ascii", b"\xff"),
1046            ("utf-8", b"\xff"),
1047            ("utf-16be", b'\xdc\x80'),
1048            ("utf-32be", b'\x00\x00\xdc\x80'),
1049            ("iso-8859-6", b"\xff"),
1050        ]
1051        for enc, bad in testcases:
1052            input = "abcd".encode(enc) + bad
1053            with self.subTest(encoding=enc):
1054                handler.count = 50
1055                decoded = input.decode(enc, "test.bug36819")
1056                self.assertEqual(decoded, 'abcdx' * 51)
1057
1058    def test_encodehelper_bug36819(self):
1059        handler = RepeatedPosReturn()
1060        codecs.register_error("test.bug36819", handler.handle)
1061
1062        input = "abcd\udc80"
1063        encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"]  # built-in
1064        encodings += ["iso-8859-15"]  # charmap codec
1065        if sys.platform == 'win32':
1066            encodings = ["mbcs", "oem"]  # code page codecs
1067
1068        handler.repl = "\udcff"
1069        for enc in encodings:
1070            with self.subTest(encoding=enc):
1071                handler.count = 50
1072                with self.assertRaises(UnicodeEncodeError) as cm:
1073                    input.encode(enc, "test.bug36819")
1074                exc = cm.exception
1075                self.assertEqual(exc.start, 4)
1076                self.assertEqual(exc.end, 5)
1077                self.assertEqual(exc.object, input)
1078        if sys.platform == "win32":
1079            handler.count = 50
1080            with self.assertRaises(UnicodeEncodeError) as cm:
1081                codecs.code_page_encode(437, input, "test.bug36819")
1082            exc = cm.exception
1083            self.assertEqual(exc.start, 4)
1084            self.assertEqual(exc.end, 5)
1085            self.assertEqual(exc.object, input)
1086
1087        handler.repl = "x"
1088        for enc in encodings:
1089            with self.subTest(encoding=enc):
1090                # The interpreter should segfault after a handful of attempts.
1091                # 50 was chosen to try to ensure a segfault without a fix,
1092                # but not OOM a machine with one.
1093                handler.count = 50
1094                encoded = input.encode(enc, "test.bug36819")
1095                self.assertEqual(encoded.decode(enc), "abcdx" * 51)
1096        if sys.platform == "win32":
1097            handler.count = 50
1098            encoded = codecs.code_page_encode(437, input, "test.bug36819")
1099            self.assertEqual(encoded[0].decode(), "abcdx" * 51)
1100            self.assertEqual(encoded[1], len(input))
1101
1102    def test_translatehelper(self):
1103        # enhance coverage of:
1104        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
1105        # and callers
1106        # (Unfortunately the errors argument is not directly accessible
1107        # from Python, so we can't test that much)
1108        class D(dict):
1109            def __getitem__(self, key):
1110                raise ValueError
1111        #self.assertRaises(ValueError, "\xff".translate, D())
1112        self.assertRaises(ValueError, "\xff".translate, {0xff: sys.maxunicode+1})
1113        self.assertRaises(TypeError, "\xff".translate, {0xff: ()})
1114
1115    def test_bug828737(self):
1116        charmap = {
1117            ord("&"): "&amp;",
1118            ord("<"): "&lt;",
1119            ord(">"): "&gt;",
1120            ord('"'): "&quot;",
1121        }
1122
1123        for n in (1, 10, 100, 1000):
1124            text = 'abc<def>ghi'*n
1125            text.translate(charmap)
1126
1127    def test_mutating_decode_handler(self):
1128        baddata = [
1129            ("ascii", b"\xff"),
1130            ("utf-7", b"++"),
1131            ("utf-8",  b"\xff"),
1132            ("utf-16", b"\xff"),
1133            ("utf-32", b"\xff"),
1134            ("unicode-escape", b"\\u123g"),
1135            ("raw-unicode-escape", b"\\u123g"),
1136        ]
1137
1138        def replacing(exc):
1139            if isinstance(exc, UnicodeDecodeError):
1140                exc.object = 42
1141                return ("\u4242", 0)
1142            else:
1143                raise TypeError("don't know how to handle %r" % exc)
1144        codecs.register_error("test.replacing", replacing)
1145
1146        for (encoding, data) in baddata:
1147            with self.assertRaises(TypeError):
1148                data.decode(encoding, "test.replacing")
1149
1150        def mutating(exc):
1151            if isinstance(exc, UnicodeDecodeError):
1152                exc.object = b""
1153                return ("\u4242", 0)
1154            else:
1155                raise TypeError("don't know how to handle %r" % exc)
1156        codecs.register_error("test.mutating", mutating)
1157        # If the decoder doesn't pick up the modified input the following
1158        # will lead to an endless loop
1159        for (encoding, data) in baddata:
1160            self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
1161
1162    def test_mutating_decode_handler_unicode_escape(self):
1163        decode = codecs.unicode_escape_decode
1164        def mutating(exc):
1165            if isinstance(exc, UnicodeDecodeError):
1166                r = data.get(exc.object[:exc.end])
1167                if r is not None:
1168                    exc.object = r[0] + exc.object[exc.end:]
1169                    return ('\u0404', r[1])
1170            raise AssertionError("don't know how to handle %r" % exc)
1171
1172        codecs.register_error('test.mutating2', mutating)
1173        data = {
1174            br'\x0': (b'\\', 0),
1175            br'\x3': (b'xxx\\', 3),
1176            br'\x5': (b'x\\', 1),
1177        }
1178        def check(input, expected, msg):
1179            with self.assertWarns(DeprecationWarning) as cm:
1180                self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
1181            self.assertIn(msg, str(cm.warning))
1182
1183        check(br'\x0n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
1184        check(br'\x0n\501', '\u0404\n\u0141', r"invalid octal escape sequence '\501'")
1185        check(br'\x0z', '\u0404\\z', r"invalid escape sequence '\z'")
1186
1187        check(br'\x3n\zr', '\u0404\n\\zr', r"invalid escape sequence '\z'")
1188        check(br'\x3zr', '\u0404\\zr', r"invalid escape sequence '\z'")
1189        check(br'\x3z5', '\u0404\\z5', r"invalid escape sequence '\z'")
1190        check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r"invalid escape sequence '\z'")
1191        check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r"invalid escape sequence '\z'")
1192
1193        check(br'\x5n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
1194        check(br'\x5n\501', '\u0404\n\u0141', r"invalid octal escape sequence '\501'")
1195        check(br'\x5z', '\u0404\\z', r"invalid escape sequence '\z'")
1196        check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r"invalid escape sequence '\z'")
1197
1198    # issue32583
1199    def test_crashing_decode_handler(self):
1200        # better generating one more character to fill the extra space slot
1201        # so in debug build it can steadily fail
1202        def forward_shorter_than_end(exc):
1203            if isinstance(exc, UnicodeDecodeError):
1204                # size one character, 0 < forward < exc.end
1205                return ('\ufffd', exc.start+1)
1206            else:
1207                raise TypeError("don't know how to handle %r" % exc)
1208        codecs.register_error(
1209            "test.forward_shorter_than_end", forward_shorter_than_end)
1210
1211        self.assertEqual(
1212            b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
1213                'utf-16-le', 'test.forward_shorter_than_end'),
1214            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
1215        )
1216        self.assertEqual(
1217            b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode(
1218                'utf-16-be', 'test.forward_shorter_than_end'),
1219            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
1220        )
1221        self.assertEqual(
1222            b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode(
1223                'utf-32-le', 'test.forward_shorter_than_end'),
1224            '\ufffd\ufffd\ufffd\u1111\x00'
1225        )
1226        self.assertEqual(
1227            b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode(
1228                'utf-32-be', 'test.forward_shorter_than_end'),
1229            '\ufffd\ufffd\ufffd\u1111\x00'
1230        )
1231
1232        def replace_with_long(exc):
1233            if isinstance(exc, UnicodeDecodeError):
1234                exc.object = b"\x00" * 8
1235                return ('\ufffd', exc.start)
1236            else:
1237                raise TypeError("don't know how to handle %r" % exc)
1238        codecs.register_error("test.replace_with_long", replace_with_long)
1239
1240        self.assertEqual(
1241            b'\x00'.decode('utf-16', 'test.replace_with_long'),
1242            '\ufffd\x00\x00\x00\x00'
1243        )
1244        self.assertEqual(
1245            b'\x00'.decode('utf-32', 'test.replace_with_long'),
1246            '\ufffd\x00\x00'
1247        )
1248
1249
1250    def test_fake_error_class(self):
1251        handlers = [
1252            codecs.strict_errors,
1253            codecs.ignore_errors,
1254            codecs.replace_errors,
1255            codecs.backslashreplace_errors,
1256            codecs.namereplace_errors,
1257            codecs.xmlcharrefreplace_errors,
1258            codecs.lookup_error('surrogateescape'),
1259            codecs.lookup_error('surrogatepass'),
1260        ]
1261        for cls in UnicodeEncodeError, UnicodeDecodeError, UnicodeTranslateError:
1262            class FakeUnicodeError(str):
1263                __class__ = cls
1264            for handler in handlers:
1265                with self.subTest(handler=handler, error_class=cls):
1266                    self.assertRaises(TypeError, handler, FakeUnicodeError())
1267            class FakeUnicodeError(Exception):
1268                __class__ = cls
1269            for handler in handlers:
1270                with self.subTest(handler=handler, error_class=cls):
1271                    with self.assertRaises((TypeError, FakeUnicodeError)):
1272                        handler(FakeUnicodeError())
1273
1274
# Run the tests in this file when it is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
1277