• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import codecs
2import contextlib
3import io
4import locale
5import sys
6import unittest
7import encodings
8from unittest import mock
9
10from test import support
11
12try:
13    import _testcapi
14except ImportError:
15    _testcapi = None
16
17try:
18    import ctypes
19except ImportError:
20    ctypes = None
21    SIZEOF_WCHAR_T = -1
22else:
23    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
24
def coding_checker(self, coder):
    """Return a helper asserting that ``coder(input) == (expect, len(input))``.

    *self* is the TestCase whose assertEqual provides failure reporting;
    *coder* is a codec encode/decode callable returning (result, consumed).
    """
    def check(input, expect):
        actual = coder(input)
        self.assertEqual(actual, (expect, len(input)))
    return check
29
# On small versions of Windows like Windows IoT or Windows Nano Server not all codepages are present
def is_code_page_present(cp):
    """Return whether Windows code page *cp* is installed (Windows only).

    Calls ``GetCPInfoExW`` from kernel32 through ctypes; the returned BOOL
    is truthy when the code page is present on this system.
    """
    from ctypes import POINTER, WINFUNCTYPE, WinDLL
    # Fix: UINT was previously listed twice in this import.
    from ctypes.wintypes import BOOL, UINT, BYTE, WCHAR, DWORD

    MAX_LEADBYTES = 12  # 5 ranges, 2 bytes ea., 0 term.
    MAX_DEFAULTCHAR = 2 # single or double byte
    MAX_PATH = 260

    # Mirror of the Win32 CPINFOEXW structure; field order must match the
    # native layout exactly.
    class CPINFOEXW(ctypes.Structure):
        _fields_ = [("MaxCharSize", UINT),
                    ("DefaultChar", BYTE*MAX_DEFAULTCHAR),
                    ("LeadByte", BYTE*MAX_LEADBYTES),
                    ("UnicodeDefaultChar", WCHAR),
                    ("CodePage", UINT),
                    ("CodePageName", WCHAR*MAX_PATH)]

    prototype = WINFUNCTYPE(BOOL, UINT, DWORD, POINTER(CPINFOEXW))
    GetCPInfoEx = prototype(("GetCPInfoExW", WinDLL("kernel32")))
    info = CPINFOEXW()
    return GetCPInfoEx(cp, 0, info)
50
class Queue(object):
    """
    A FIFO byte queue: data written at one end is read back from the other.
    """
    def __init__(self, buffer):
        # Backing sequence; emptied via buffer[:0] so any sliceable
        # concatenable type (bytes, str, bytearray) works.
        self._buffer = buffer

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size < 0:
            # Drain everything that has been written so far.
            data, self._buffer = self._buffer, self._buffer[:0]
            return data
        data, self._buffer = self._buffer[:size], self._buffer[size:]
        return data
70
71
class MixInCheckStateHandling:
    """Mixin checking getstate()/setstate() round-trips of incremental codecs."""

    def check_state_handling_decode(self, encoding, u, s):
        """Decode *s* split at every position, transplanting decoder state.

        The state captured after decoding the first half must allow a
        fresh decoder to finish the second half and still produce *u*.
        """
        for split in range(len(s) + 1):
            decoder = codecs.getincrementaldecoder(encoding)()
            head = decoder.decode(s[:split])
            state = decoder.getstate()
            buffered, flags = state
            self.assertIsInstance(flags, int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not flags:
                # With no extra flags set, re-feeding the buffered bytes
                # from the default state must emit nothing...
                decoder.setstate((buffered[:0], 0))
                self.assertFalse(decoder.decode(buffered))
                # ...and must restore exactly the same state.
                self.assertEqual(state, decoder.getstate())
            # A brand-new decoder primed with the captured state has to be
            # able to finish the job.
            decoder = codecs.getincrementaldecoder(encoding)()
            decoder.setstate(state)
            tail = decoder.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Encode *u* split at every position, transplanting encoder state."""
        for split in range(len(u) + 1):
            encoder = codecs.getincrementalencoder(encoding)()
            head = encoder.encode(u[:split])
            state = encoder.getstate()
            encoder = codecs.getincrementalencoder(encoding)()
            encoder.setstate(state)
            tail = encoder.encode(u[split:], True)
            self.assertEqual(s, head + tail)
104
105
class ReadTest(MixInCheckStateHandling):
    """Decoding tests shared by the concrete per-encoding TestCases.

    Subclasses must define ``encoding`` (the codec name) and, for the lone
    surrogate tests, ``ill_formed_sequence`` (the encoded form of an
    unpaired surrogate in that codec).
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        """readline() must handle \\n, \\r\\n, \\r and U+2028 line ends."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        """Mixing readline()/read()/readlines() must not lose buffered data."""
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        """Iterating a StreamReader over \\r\\n-terminated lines (bug #1175396)."""
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """Interleaved writes and readline() through a Queue-backed pair."""
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """readline() across internal read-chunk boundaries (bug #1098990)."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        """Second readline() regression scenario from bug #1098990."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # What the 'replace' handler yields for one ill-formed sequence;
    # overridden by subclasses where a sequence decodes to several U+FFFD.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        """Unpaired surrogates must fail strict coding but obey error handlers."""
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Encoding "" yields just the BOM (if the codec writes one).
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)

    def test_incremental_surrogatepass(self):
        # Test incremental decoder for surrogatepass handler:
        # see issue #24214
        # High surrogate
        data = '\uD901'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:], True), '\uD901')
        # Low surrogate
        data = '\uDC02'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:]), '\uDC02')
444
445
class UTF32Test(ReadTest, unittest.TestCase):
    """UTF-32 with BOM detection (codec chooses byte order from the BOM)."""
    encoding = "utf-32"
    # Encoded form of a lone low surrogate (U+DC80) in native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" encoded with a little-endian / big-endian BOM respectively.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit a single BOM."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """All-0xFF data is not a valid BOM and must raise UnicodeError."""
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # Expected output after each single byte is fed: 4 BOM bytes first,
        # then one new character for every fourth byte.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        """A truncated unit yields U+FFFD with 'replace' and '' with 'ignore'."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """Truncated input must raise under the strict handler."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """State round-trips for both byte orders (see MixInCheckStateHandling)."""
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
540
541
class UTF32LETest(ReadTest, unittest.TestCase):
    """UTF-32 little-endian, no BOM."""
    encoding = "utf-32-le"
    # A lone low surrogate (U+DC80) in little-endian UTF-32.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # Without a BOM every character occupies exactly four bytes, so each
        # decoded prefix repeats four times: nothing appears until the first
        # character's last byte arrives.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = ["", "", ""]
        for prefix in ("\x00", "\x00\xff", "\x00\xff\u0100",
                       "\x00\xff\u0100\uffff"):
            expected.extend([prefix] * 4)
        expected.append(text)
        self.check_partial(text, expected)

    def test_simple(self):
        """One astral character: least significant byte is stored first."""
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        """A truncated code unit must raise under the strict handler."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        decoded, consumed = codecs.utf_32_le_decode(encoded)
        self.assertEqual(decoded, '\U00010000' * 1024)
586
587
class UTF32BETest(ReadTest, unittest.TestCase):
    """UTF-32 big-endian, no BOM."""
    encoding = "utf-32-be"
    # A lone low surrogate (U+DC80) in big-endian UTF-32.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # Without a BOM every character occupies exactly four bytes, so each
        # decoded prefix repeats four times: nothing appears until the first
        # character's last byte arrives.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = ["", "", ""]
        for prefix in ("\x00", "\x00\xff", "\x00\xff\u0100",
                       "\x00\xff\u0100\uffff"):
            expected.extend([prefix] * 4)
        expected.append(text)
        self.check_partial(text, expected)

    def test_simple(self):
        """One astral character: most significant byte is stored first."""
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        """A truncated code unit must raise under the strict handler."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        decoded, consumed = codecs.utf_32_be_decode(encoded)
        self.assertEqual(decoded, '\U00010000' * 1024)
632
633
class UTF16Test(ReadTest, unittest.TestCase):
    """UTF-16 with BOM detection (codec chooses byte order from the BOM)."""
    encoding = "utf-16"
    # Encoded form of a lone low surrogate (U+DC80) in native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" encoded with a little-endian / big-endian BOM respectively.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit a single BOM."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """0xFFFF is not a valid BOM and must raise UnicodeError."""
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # Expected output after each single byte is fed: 2 BOM bytes first,
        # then one new BMP character per pair; the astral character needs
        # four bytes (a surrogate pair).
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        """A truncated unit yields U+FFFD with 'replace' and '' with 'ignore'."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """Truncated input must raise under the strict handler."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """State round-trips for both byte orders (see MixInCheckStateHandling)."""
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # codecs.open() with mode 'U' is deprecated; the warning is expected.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
719
class UTF16LETest(ReadTest, unittest.TestCase):
    """UTF-16 little-endian, no BOM."""
    encoding = "utf-16-le"
    # A lone low surrogate (U+DC80) in little-endian UTF-16.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # Without a BOM each BMP character takes two bytes; the astral
        # character at the end takes four (a surrogate pair).
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        """Ill-formed input: strict must raise, 'replace' yields U+FFFD."""
        # (raw bytes, expected 'replace' decoding) pairs: truncated units,
        # lone high surrogates, and lone low surrogates.
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        """Astral characters round-trip through a surrogate pair."""
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
763
class UTF16BETest(ReadTest, unittest.TestCase):
    """UTF-16 big-endian, no BOM."""
    encoding = "utf-16-be"
    # A lone low surrogate (U+DC80) in big-endian UTF-16.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # Without a BOM each BMP character takes two bytes; the astral
        # character at the end takes four (a surrogate pair).
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        """Ill-formed input: strict must raise, 'replace' yields U+FFFD."""
        # (raw bytes, expected 'replace' decoding) pairs: truncated units,
        # lone high surrogates, and lone low surrogates.
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        """Astral characters round-trip through a surrogate pair."""
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
807
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the utf-8 codec.

    UTF8SigTest subclasses this and overrides *encoding* and *BOM* so the
    same checks also exercise the utf-8-sig codec.
    """
    encoding = "utf-8"
    # A lone low surrogate (U+DC80) encoded CESU-8-style: ill-formed UTF-8.
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    BOM = b''

    def test_partial(self):
        # Feed the encoded form one byte at a time; after each byte the
        # incremental decoder must have produced exactly the listed prefix.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_decoder_state(self):
        # Decoder getstate()/setstate() round-trips mid-stream.
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_decode_error(self):
        # Each error handler gets the same ill-formed input.
        for data, error_handler, expected in (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.decode(self.encoding, error_handler),
                                 expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # Additional utf-8-specific checks beyond the base-class ones;
        # not sure if this is making sense for UTF-16 and UTF-32.
        self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"),
                         self.BOM + b'[\x80]')

        # surrogateescape only covers U+DC80..U+DCFF; other surrogates
        # must still raise, with start/end pointing at the bad pair.
        with self.assertRaises(UnicodeEncodeError) as cm:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        exc = cm.exception
        self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        # surrogatepass round-trips lone surrogates as CESU-8-style bytes.
        self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"abc\xed\xa0\x80def")
        self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"),
                         self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]')

        self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"),
                         "\U00010fff\uD800")

        # Truncated surrogate encodings still raise even with surrogatepass.
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode(self.encoding, "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")

    def test_incremental_errors(self):
        # Test that the incremental decoder can fail with final=False.
        # See issue #24214
        cases = [b'\x80', b'\xBF', b'\xC0', b'\xC1', b'\xF5', b'\xF6', b'\xFF']
        # Valid prefixes followed by a byte that cannot continue them.
        for prefix in (b'\xC2', b'\xDF', b'\xE0', b'\xE0\xA0', b'\xEF',
                       b'\xEF\xBF', b'\xF0', b'\xF0\x90', b'\xF0\x90\x80',
                       b'\xF4', b'\xF4\x8F', b'\xF4\x8F\xBF'):
            for suffix in b'\x7F', b'\xC0':
                cases.append(prefix + suffix)
        # Overlong forms, encoded surrogates, and values above U+10FFFF.
        cases.extend((b'\xE0\x80', b'\xE0\x9F', b'\xED\xA0\x80',
                      b'\xED\xBF\xBF', b'\xF0\x80', b'\xF0\x8F', b'\xF4\x90'))

        for data in cases:
            with self.subTest(data=data):
                dec = codecs.getincrementaldecoder(self.encoding)()
                self.assertRaises(UnicodeDecodeError, dec.decode, data)
900
901
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the utf-7 codec (RFC 2152)."""
    encoding = "utf-7"

    def test_ascii(self):
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
        self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
        self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
        # '+' itself must be escaped as '+-'.
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
        self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
        # Other ASCII characters: everything else goes through a
        # base64-encoded shift sequence.
        other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # Feed the encoded form one byte at a time; after each byte the
        # incremental decoder must have produced exactly the listed prefix.
        self.check_partial(
            'a+-b\x00c\x80d\u0100e\U00010000f',
            [
                'a',
                'a',
                'a+',
                'a+-',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b\x00',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c\x80',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d\u0100',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e\U00010000',
                'a+-b\x00c\x80d\u0100e\U00010000f',
            ]
        )

    def test_errors(self):
        # Malformed shift sequences: strict decoding raises, 'replace'
        # substitutes U+FFFD for the unusable part.
        tests = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
            (b'a+@b', 'a\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                                raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # Non-BMP characters are encoded as a surrogate pair inside the
        # shift sequence; the trailing '-' terminator is optional on decode.
        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
        self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
        self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
        self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
                         b'+IKwgrNgB3KA-')
        self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')
        self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # A lone high surrogate decodes as-is when the sequence is
        # otherwise well-formed, and as U+FFFD when it is not.
        tests = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
1034
1035
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() helper."""

    def test_errors(self):
        # A single odd byte cannot be decoded strictly.
        self.assertRaises(UnicodeDecodeError,
                          codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without the mandatory data argument is a TypeError.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
1043
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        # Any object exposing the buffer interface is accepted.
        import array
        buf = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(buf), (b"spam", 4))

    def test_empty(self):
        # An empty string yields empty bytes and a consumed length of 0.
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # Missing argument and non-buffer argument are both rejected.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
1059
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the utf-8-sig codec (UTF-8 with a leading BOM).

    Inherits most checks from UTF8Test; *encoding* and *BOM* are
    overridden so the inherited assertions use the BOM-prefixed form.
    """
    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        # Feed the encoded form one byte at a time; after each byte the
        # incremental decoder must have produced exactly the listed prefix.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder strips exactly one leading BOM.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_decode(self, bytestring, expected):
        """Decode *bytestring* through a utf-8-sig stream reader.

        Reads with a range of chunk sizes (and once without a size hint)
        and asserts the accumulated output equals *expected*.  Shared by
        test_stream_bom and test_stream_bare, which previously duplicated
        this whole loop.
        """
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()
                if not data:
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), expected)

    def test_stream_bom(self):
        # A leading BOM is consumed and does not appear in the output.
        self._check_stream_decode(
            codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ")

    def test_stream_bare(self):
        # Input without a BOM decodes unchanged.
        self._check_stream_decode(
            b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ")
1144
1145
1146class EscapeDecodeTest(unittest.TestCase):
1147    def test_empty(self):
1148        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
1149        self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))
1150
1151    def test_raw(self):
1152        decode = codecs.escape_decode
1153        for b in range(256):
1154            b = bytes([b])
1155            if b != b'\\':
1156                self.assertEqual(decode(b + b'0'), (b + b'0', 2))
1157
1158    def test_escape(self):
1159        decode = codecs.escape_decode
1160        check = coding_checker(self, decode)
1161        check(b"[\\\n]", b"[]")
1162        check(br'[\"]', b'["]')
1163        check(br"[\']", b"[']")
1164        check(br"[\\]", b"[\\]")
1165        check(br"[\a]", b"[\x07]")
1166        check(br"[\b]", b"[\x08]")
1167        check(br"[\t]", b"[\x09]")
1168        check(br"[\n]", b"[\x0a]")
1169        check(br"[\v]", b"[\x0b]")
1170        check(br"[\f]", b"[\x0c]")
1171        check(br"[\r]", b"[\x0d]")
1172        check(br"[\7]", b"[\x07]")
1173        check(br"[\78]", b"[\x078]")
1174        check(br"[\41]", b"[!]")
1175        check(br"[\418]", b"[!8]")
1176        check(br"[\101]", b"[A]")
1177        check(br"[\1010]", b"[A0]")
1178        check(br"[\501]", b"[A]")
1179        check(br"[\x41]", b"[A]")
1180        check(br"[\x410]", b"[A0]")
1181        for i in range(97, 123):
1182            b = bytes([i])
1183            if b not in b'abfnrtvx':
1184                with self.assertWarns(DeprecationWarning):
1185                    check(b"\\" + b, b"\\" + b)
1186            with self.assertWarns(DeprecationWarning):
1187                check(b"\\" + b.upper(), b"\\" + b.upper())
1188        with self.assertWarns(DeprecationWarning):
1189            check(br"\8", b"\\8")
1190        with self.assertWarns(DeprecationWarning):
1191            check(br"\9", b"\\9")
1192        with self.assertWarns(DeprecationWarning):
1193            check(b"\\\xfa", b"\\\xfa")
1194
1195    def test_errors(self):
1196        decode = codecs.escape_decode
1197        self.assertRaises(ValueError, decode, br"\x")
1198        self.assertRaises(ValueError, decode, br"[\x]")
1199        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
1200        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
1201        self.assertRaises(ValueError, decode, br"\x0")
1202        self.assertRaises(ValueError, decode, br"[\x0]")
1203        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
1204        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
1205
1206
# From RFC 3492
# Each entry is a (unicode_string, punycode_bytes) pair taken from the
# RFC's sample strings (section 7.1); letters in parentheses below are
# the RFC's own labels for each sample.
punycode_testcases = [
    # A Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

     # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]
1310
# Sanity-check the table: every entry must be a (unicode, punycode) pair.
# The old code merely printed malformed entries to stdout, which is easy
# to miss; fail loudly at import time instead.
for _testcase in punycode_testcases:
    if len(_testcase) != 2:
        raise AssertionError("malformed punycode test case: %r" % (_testcase,))
1314
1315
1316class PunycodeTest(unittest.TestCase):
1317    def test_encode(self):
1318        for uni, puny in punycode_testcases:
1319            # Need to convert both strings to lower case, since
1320            # some of the extended encodings use upper case, but our
1321            # code produces only lower case. Converting just puny to
1322            # lower is also insufficient, since some of the input characters
1323            # are upper case.
1324            self.assertEqual(
1325                str(uni.encode("punycode"), "ascii").lower(),
1326                str(puny, "ascii").lower()
1327            )
1328
1329    def test_decode(self):
1330        for uni, puny in punycode_testcases:
1331            self.assertEqual(uni, puny.decode("punycode"))
1332            puny = puny.decode("ascii").encode("ascii")
1333            self.assertEqual(uni, puny.decode("punycode"))
1334
1335    def test_decode_invalid(self):
1336        testcases = [
1337            (b"xn--w&", "strict", UnicodeError()),
1338            (b"xn--w&", "ignore", "xn-"),
1339        ]
1340        for puny, errors, expected in testcases:
1341            with self.subTest(puny=puny, errors=errors):
1342                if isinstance(expected, Exception):
1343                    self.assertRaises(UnicodeError, puny.decode, "punycode", errors)
1344                else:
1345                    self.assertEqual(puny.decode("punycode", errors), expected)
1346
1347
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is an (input, expected_output) pair of UTF-8 encoded bytes;
# expected_output None means nameprep must reject the input, and an
# entry of (None, None) marks a vector skipped in this build.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1500
1501
class NameprepTest(unittest.TestCase):
    """Run the libidn nameprep test vectors against encodings.idna.nameprep."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped vector (see comments in nameprep_tests).
                continue
            # The Unicode strings are given in UTF-8; "surrogatepass"
            # keeps vectors containing lone surrogates decodable.
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # support.TestFailed was removed from test.support;
                    # label the failing vector and fail via unittest.
                    self.fail("Test 3.%d: %s" % (pos+1, str(e)))
1520
1521
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec (RFC 3490 internationalized domain names)."""

    def test_builtin_decode(self):
        # ACE byte strings decode to their Unicode form; a trailing dot
        # is preserved.
        for ace, expected in (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ):
            self.assertEqual(str(ace, "idna"), expected)

    def test_builtin_encode(self):
        # Unicode domains encode to their ACE (punycode) form.
        for text, expected in (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ):
            self.assertEqual(text.encode("idna"), expected)

    def test_builtin_decode_length_limit(self):
        # Overlong ACE labels must be rejected with a "too long" error.
        for padding in (1100, 70):
            ace = b"xn--016c" + b"a" * padding
            with self.assertRaisesRegex(UnicodeError, "too long"):
                ace.decode("idna")

    def test_stream(self):
        # A StreamReader that has consumed everything returns "".
        stream_reader = codecs.getreader("idna")(io.BytesIO(b"abc"))
        stream_reader.read(3)
        self.assertEqual(stream_reader.read(), "")

    def test_incremental_decode(self):
        # iterdecode() fed one byte at a time must agree with bulk decoding.
        for raw, expected in (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ):
            pieces = (bytes([byte]) for byte in raw)
            self.assertEqual("".join(codecs.iterdecode(pieces, "idna")),
                             expected)

        # An incremental decoder buffers input until a label is complete.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # reset() discards any buffered partial label.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() must agree with bulk encoding.
        for text, expected in (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ):
            self.assertEqual(b"".join(codecs.iterencode(text, "idna")),
                             expected)

        # An incremental encoder emits a label only once it is terminated.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only the "strict" error handler is supported by the idna codec."""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception,
                              "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1613
1614
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level helper functions of the codecs module."""

    def test_decode(self):
        # codecs.decode(): positional form, missing args, default encoding
        # (UTF-8), and error propagation.
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

        # test keywords
        self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
                         '\xe4\xf6\xfc')
        self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
                         '[]')

    def test_encode(self):
        # codecs.encode(): mirror of test_decode above, plus an unknown
        # encoding name which must raise LookupError.
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

        # test keywords
        self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
                         b'[]')

    def test_register(self):
        # register() requires exactly one callable argument.
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        # Unknown names (including whitespace-only) raise LookupError.
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        # NOTE: the cleanup is registered *before* switching the locale so
        # the original setting is restored even if setlocale partially fails.
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')

    def test_all(self):
        # codecs.__all__ must match this expected public API exactly, and
        # every listed name must actually exist on the module.
        api = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(api, codecs.__all__)
        for api in codecs.__all__:
            getattr(codecs, api)

    def test_open(self):
        # codecs.open() returns a StreamReaderWriter for every open mode.
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode), \
                    codecs.open(support.TESTFN, mode, 'ascii') as file:
                self.assertIsInstance(file, codecs.StreamReaderWriter)

    def test_undefined(self):
        # The "undefined" codec refuses all input, even empty strings,
        # regardless of the error handler.
        self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
        self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
        self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
        self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            self.assertRaises(UnicodeError,
                codecs.encode, 'abc', 'undefined', errors)
            self.assertRaises(UnicodeError,
                codecs.decode, b'abc', 'undefined', errors)

    def test_file_closes_if_lookup_error_raised(self):
        # If the encoding lookup fails after the file was opened,
        # codecs.open() must close the underlying file object.
        mock_open = mock.mock_open()
        with mock.patch('builtins.open', mock_open) as file:
            with self.assertRaises(LookupError):
                codecs.open(support.TESTFN, 'wt', 'invalid-encoding')

            file().close.assert_called()
1730
1731
class StreamReaderTest(unittest.TestCase):
    """Basic sanity check for codecs.StreamReader obtained via getreader()."""

    def setUp(self):
        # UTF-8 bytes for two Hangul syllables separated by a newline.
        self.reader = codecs.getreader('utf-8')
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        stream_reader = self.reader(self.stream)
        self.assertEqual(stream_reader.readlines(), ['\ud55c\n', '\uae00'])
1741
1742
class EncodedFileTest(unittest.TestCase):
    """Tests for the codecs.EncodedFile transcoding wrapper."""

    def test_basic(self):
        # Reading: UTF-8 bytes in the underlying file come back
        # re-encoded in the data encoding (UTF-16-LE here).
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        wrapped = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(wrapped.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 input bytes are stored in the file encoding
        # (Latin-1 here).
        sink = io.BytesIO()
        wrapped = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        wrapped.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
1754
# Every codec exercised by the generic round-trip tests below, in
# alphabetical order.  Platform-specific codecs are appended conditionally
# afterwards.
all_unicode_encodings = [
    "ascii", "big5", "big5hkscs", "charmap", "cp037", "cp1006", "cp1026",
    "cp1125", "cp1140", "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258", "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775", "cp850", "cp852", "cp855", "cp856", "cp857",
    "cp858", "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866",
    "cp869", "cp874", "cp875", "cp932", "cp949", "cp950", "euc_jis_2004",
    "euc_jisx0213", "euc_jp", "euc_kr", "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna", "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2",
    "iso2022_jp_2004", "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_t", "koi8_u", "kz1048", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish", "palmos", "ptcp154", "punycode", "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213", "tis_620",
    "unicode_escape", "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# The Windows-only codecs are present only when the interpreter exposes
# the corresponding low-level encode functions.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
if hasattr(codecs, "oem_encode"):
    all_unicode_encodings.append("oem")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_stateful = [
    "punycode",
]
1872
1873
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic smoke tests run against every codec in all_unicode_encodings
    (state handling checks come from MixInCheckStateHandling)."""

    def test_basics(self):
        """Round-trip a small ASCII string through the stateless, stream
        and incremental interfaces of every codec."""
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                # lookup("latin_1").name differs from the alias used here;
                # force the alias so the dash/underscore comparison matches.
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # Stateless encoder/decoder round-trip.
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "encoding=%r" % encoding)
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_stateful:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    # Write one character at a time to exercise statefulness.
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    # Feed the reader one byte at a time (partial input).
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_stateful:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "encoding=%r" % encoding)

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    @support.cpython_only
    def test_basics_capi(self):
        """Same incremental round-trip, but with codec objects obtained
        through the C API helpers in _testcapi."""
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding not in broken_unicode_with_stateful:
                # check incremental decoder/encoder (fetched via the C API)
                try:
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    def test_seek(self):
        """seek(0) must reset internal codec state so rereads are stable."""
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_stateful:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        """A decoder called with no arguments, or with a non-bytes argument,
        must raise TypeError (idna/punycode accept ints, so skip those)."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """An encoder called with no arguments must raise TypeError."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        """Importing a charmap codec must not crash (regression check)."""
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        """getstate()/setstate() round-trips, via MixInCheckStateHandling."""
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_unicode_with_stateful:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
2030
2031
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode with the three supported mapping
    kinds: a string table, an int->str dict, and an int->int dict."""

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode

        # Successful decodes: (mapping table, errors, expected output).
        # The reported length always covers all three input bytes.
        for mapping, errors, expected in (
            ("abc", "strict", "abc"),
            ("\U0010FFFFbc", "strict", "\U0010FFFFbc"),
            ("ab", "replace", "ab\ufffd"),
            ("ab\ufffe", "replace", "ab\ufffd"),
            ("ab", "backslashreplace", "ab\\x02"),
            ("ab\ufffe", "backslashreplace", "ab\\x02"),
            ("ab", "ignore", "ab"),
            ("ab\ufffe", "ignore", "ab"),
        ):
            self.assertEqual(decode(b"\x00\x01\x02", errors, mapping),
                             (expected, 3))

        # Under "strict", a short table or a U+FFFE entry is a decode error.
        for mapping in ("ab", "ab\ufffe"):
            self.assertRaises(UnicodeDecodeError,
                              decode, b"\x00\x01\x02", "strict", mapping)

        # With "ignore" and an empty table everything is dropped, yet the
        # consumed length still equals the full input length.
        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", ""), ("", len(allbytes)))

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode

        for mapping, errors, expected in (
            ({0: 'a', 1: 'b', 2: 'c'}, "strict", "abc"),
            # An entry may expand to several characters...
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "strict", "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "strict", "\U0010FFFFbc"),
            # ...or to none at all.
            ({0: 'a', 1: 'b', 2: ''}, "strict", "ab"),
            ({0: 'a', 1: 'b'}, "replace", "ab\ufffd"),
            ({0: 'a', 1: 'b', 2: None}, "replace", "ab\ufffd"),
            # Issue #14850: a '\ufffe' entry behaves like a missing one.
            ({0: 'a', 1: 'b', 2: '\ufffe'}, "replace", "ab\ufffd"),
            ({0: 'a', 1: 'b'}, "backslashreplace", "ab\\x02"),
            ({0: 'a', 1: 'b', 2: None}, "backslashreplace", "ab\\x02"),
            ({0: 'a', 1: 'b', 2: '\ufffe'}, "backslashreplace", "ab\\x02"),
            ({0: 'a', 1: 'b'}, "ignore", "ab"),
            ({0: 'a', 1: 'b', 2: None}, "ignore", "ab"),
            ({0: 'a', 1: 'b', 2: '\ufffe'}, "ignore", "ab"),
        ):
            self.assertEqual(decode(b"\x00\x01\x02", errors, mapping),
                             (expected, 3))

        # Missing, None, or U+FFFE entries are errors under "strict"
        # (the last per issue #14850).
        for mapping in ({0: 'a', 1: 'b'},
                        {0: 'a', 1: 'b', 2: None},
                        {0: 'a', 1: 'b', 2: '\ufffe'}):
            self.assertRaises(UnicodeDecodeError,
                              decode, b"\x00\x01\x02", "strict", mapping)

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", {}), ("", len(allbytes)))

        # Integer replacement values must lie in range(0x110000).
        for out_of_range in (-2, 999999999):
            self.assertRaisesRegex(TypeError,
                "character mapping must be in range\\(0x110000\\)",
                decode,
                b"\x00\x01\x02", "strict", {0: "A", 1: 'Bb', 2: out_of_range}
            )

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        a, b, c = ord('a'), ord('b'), ord('c')

        for mapping, errors, expected in (
            ({0: a, 1: b, 2: c}, "strict", "abc"),
            # Issue #15379: non-BMP code points are accepted as values.
            ({0: 0x10FFFF, 1: b, 2: c}, "strict", "\U0010FFFFbc"),
            ({0: sys.maxunicode, 1: b, 2: c}, "strict",
             chr(sys.maxunicode) + "bc"),
            ({0: a, 1: b}, "replace", "ab\ufffd"),
            ({0: a, 1: b, 2: 0xFFFE}, "replace", "ab\ufffd"),
            ({0: a, 1: b}, "backslashreplace", "ab\\x02"),
            ({0: a, 1: b, 2: 0xFFFE}, "backslashreplace", "ab\\x02"),
            ({0: a, 1: b}, "ignore", "ab"),
            ({0: a, 1: b, 2: 0xFFFE}, "ignore", "ab"),
        ):
            self.assertEqual(decode(b"\x00\x01\x02", errors, mapping),
                             (expected, 3))

        # A value just past sys.maxunicode is a TypeError, not a decode error.
        self.assertRaises(TypeError,
            decode, b"\x00\x01\x02", "strict",
            {0: sys.maxunicode + 1, 1: b, 2: c}
        )

        # Missing and U+FFFE entries are strict-mode decode errors.
        for mapping in ({0: a, 1: b}, {0: a, 1: b, 2: 0xFFFE}):
            self.assertRaises(UnicodeDecodeError,
                              decode, b"\x00\x01\x02", "strict", mapping)
2278
2279
class WithStmtTest(unittest.TestCase):
    """EncodedFile and StreamReaderWriter behave as context managers."""

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as encoded:
            self.assertEqual(encoded.read(), b"\xfc")
        # Leaving the with-block closes the underlying stream too.
        self.assertTrue(raw.closed)

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        codec_info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(raw, codec_info.streamreader,
                                       codec_info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
2293
2294
class TypesTest(unittest.TestCase):
    """Argument-type behaviour of the low-level codec functions."""

    def test_decode_unicode(self):
        # Most low-level decoders reject str input outright.
        decoders = [
            codecs.utf_7_decode, codecs.utf_8_decode,
            codecs.utf_16_le_decode, codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode, codecs.utf_32_le_decode,
            codecs.utf_32_be_decode, codecs.utf_32_ex_decode,
            codecs.latin_1_decode, codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # The two escape decoders do accept str input, and give the same
        # result as for the equivalent ASCII bytes.
        for decoder in (codecs.unicode_escape_decode,
                        codecs.raw_unicode_escape_decode):
            self.assertEqual(decoder(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decoder(br"\u1234"), ("\u1234", 6))

            # Code points beyond U+10FFFF are rejected under "strict" and
            # substituted under the other handlers.
            self.assertRaises(UnicodeDecodeError, decoder, br"\U00110000")
            self.assertEqual(decoder(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(decoder(r"\U00110000", "backslashreplace"),
                             (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
2334
2335
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for the stateless unicode-escape codec functions."""

    def test_empty(self):
        """Empty input encodes/decodes to empty output with zero consumed."""
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        """Printable ASCII (except backslash) encodes to itself."""
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        """Any single non-backslash byte decodes to the same code point."""
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        """Controls and non-ASCII are escaped; known short escapes preferred."""
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # Remaining C0 controls and all non-ASCII latin-1 fall back to \xNN.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        # BMP code points use \uXXXX, astral ones \UXXXXXXXX.
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        """Simple, octal, hex and unicode escape sequences are interpreted."""
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        # A backslash-newline pair is a line continuation and is dropped.
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        # Octal escapes consume at most three digits; trailing digits stay.
        check(br"[\7]", "[\x07]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        # Hex escapes take exactly two digits.
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # Unrecognized escape letters are kept literally but warn.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b.upper(), "\\" + chr(i-32))
        with self.assertWarns(DeprecationWarning):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", "\\\xfa")

    def test_decode_errors(self):
        """Truncated \\x/\\u/\\U escapes raise, or obey the error handler."""
        decode = codecs.unicode_escape_decode
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # \U00110000 is just above the Unicode range.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2422
2423
2424class RawUnicodeEscapeTest(unittest.TestCase):
2425    def test_empty(self):
2426        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
2427        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
2428
2429    def test_raw_encode(self):
2430        encode = codecs.raw_unicode_escape_encode
2431        for b in range(256):
2432            self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2433
2434    def test_raw_decode(self):
2435        decode = codecs.raw_unicode_escape_decode
2436        for b in range(256):
2437            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2438
2439    def test_escape_encode(self):
2440        encode = codecs.raw_unicode_escape_encode
2441        check = coding_checker(self, encode)
2442        for b in range(256):
2443            if b not in b'uU':
2444                check('\\' + chr(b), b'\\' + bytes([b]))
2445        check('\u20ac', br'\u20ac')
2446        check('\U0001d120', br'\U0001d120')
2447
2448    def test_escape_decode(self):
2449        decode = codecs.raw_unicode_escape_decode
2450        check = coding_checker(self, decode)
2451        for b in range(256):
2452            if b not in b'uU':
2453                check(b'\\' + bytes([b]), '\\' + chr(b))
2454        check(br"\u20ac", "\u20ac")
2455        check(br"\U0001d120", "\U0001d120")
2456
2457    def test_decode_errors(self):
2458        decode = codecs.raw_unicode_escape_decode
2459        for c, d in (b'u', 4), (b'U', 4):
2460            for i in range(d):
2461                self.assertRaises(UnicodeDecodeError, decode,
2462                                  b"\\" + c + b"0"*i)
2463                self.assertRaises(UnicodeDecodeError, decode,
2464                                  b"[\\" + c + b"0"*i + b"]")
2465                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2466                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2467                self.assertEqual(decode(data, "replace"),
2468                                 ("[\ufffd]\ufffd", len(data)))
2469        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2470        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2471        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2472
2473
class EscapeEncodeTest(unittest.TestCase):
    """Tests for the low-level codecs.escape_encode() helper."""

    def test_escape_encode(self):
        """escape_encode repr-escapes bytes and accepts only bytes objects."""
        cases = {
            b'': (b'', 0),
            b'foobar': (b'foobar', 6),
            b'spam\0eggs': (b'spam\\x00eggs', 9),
            b'a\'b': (b"a\\'b", 3),
            b'b\\c': (b'b\\\\c', 3),
            b'c\nd': (b'c\\nd', 3),
            b'd\re': (b'd\\re', 3),
            b'f\x7fg': (b'f\\x7fg', 3),
        }
        for raw, expected in cases.items():
            with self.subTest(data=raw):
                self.assertEqual(codecs.escape_encode(raw), expected)
        # str and bytearray inputs are rejected outright.
        self.assertRaises(TypeError, codecs.escape_encode, 'spam')
        self.assertRaises(TypeError, codecs.escape_encode, bytearray(b'spam'))
2492
2493
class SurrogateEscapeTest(unittest.TestCase):
    """Round-trip tests for the 'surrogateescape' error handler."""

    def _check_roundtrip(self, encoding, raw, text):
        # Undecodable bytes must surface as lone low surrogates, and
        # encoding those surrogates back must restore the original bytes.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2526
2527
class BomTest(unittest.TestCase):
    """Check BOM handling of codecs.open() streams around seek()."""

    def test_seek0(self):
        """The BOM is (re)written only when writing at position 0."""
        data = "1234567890"
        # All BOM-prefixed encodings supported by StreamWriter.
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
2583
2584
# Binary-to-binary transform codecs that are always available.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]

# Alternative lookup names that must resolve to each transform codec.
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

# zlib and bz2 are optional build-time features, so only register their
# codecs for testing when the underlying modules are importable.
try:
    import zlib
except ImportError:
    zlib = None
else:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
2614
2615
class TransformCodecTest(unittest.TestCase):
    """Tests for the bytes-to-bytes transform codecs (base64, hex, ...)."""

    def test_basics(self):
        """Encoding then decoding round-trips all 256 byte values."""
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                (o, size) = codecs.getencoder(encoding)(binput)
                self.assertEqual(size, len(binput))
                (i, size) = codecs.getdecoder(encoding)(o)
                self.assertEqual(size, len(o))
                self.assertEqual(i, binput)

    def test_read(self):
        """StreamReader.read() decodes transform-encoded data."""
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.read()
                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        """StreamReader.readline() decodes transform-encoded data."""
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.readline()
                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                data = original
                view = memoryview(data)
                data = codecs.encode(data, encoding)
                view_encoded = codecs.encode(view, encoding)
                self.assertEqual(view_encoded, data)
                view = memoryview(data)
                data = codecs.decode(data, encoding)
                self.assertEqual(data, original)
                view_decoded = codecs.decode(view, encoding)
                self.assertEqual(view_decoded, data)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.encode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.encode(encoding)
                # The LookupError must not be chained from anything.
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        msg = (r"^'rot_13' is not a text encoding; "
               r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, msg):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                msg = (r"^'rot_13' is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "zlib_codec")
        # The wrapped exception must be chained from one of the same type.
        self.assertIsInstance(failure.exception.__cause__,
                                                type(failure.exception))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "hex_codec")
        self.assertIsInstance(failure.exception.__cause__,
                                                type(failure.exception))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        """Every alias must look up the same normalized codec name."""
        for codec_name, aliases in transform_aliases.items():
            expected_name = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    info = codecs.lookup(alias)
                    self.assertEqual(info.name, expected_name)

    def test_quopri_stateless(self):
        # Should encode with quotetabs=True
        encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
        self.assertEqual(encoded, b"space=20tab=09eol=20\n")
        # But should still support unescaped tabs and spaces
        unescaped = b"space tab eol\n"
        self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)

    def test_uu_invalid(self):
        # Missing "begin" line
        self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2747
2748
2749# The codec system tries to wrap exceptions in order to ensure the error
2750# mentions the operation being performed and the codec involved. We
2751# currently *only* want this to happen for relatively stateless
2752# exceptions, where the only significant information they contain is their
2753# type and a single str argument.
2754
2755# Use a local codec registry to avoid appearing to leak objects when
2756# registering multiple search functions
# Mapping of normalized codec name -> CodecInfo used by the search function.
_TEST_CODECS = {}

def _get_test_codec(codec_name):
    # Search function consulted by the codecs machinery on every lookup.
    return _TEST_CODECS.get(codec_name)
codecs.register(_get_test_codec) # Returns None, not usable as a decorator

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    # Other implementations may not expose the cache; make it a no-op.
    def _forget_codec(codec_name):
        pass
2769
2770
class ExceptionChainingTest(unittest.TestCase):
    """Check how the codec machinery wraps exceptions raised by codecs.

    Relatively stateless exceptions (type plus at most a single str
    argument) should be wrapped so the error message names the failing
    operation and codec; anything carrying extra state must propagate
    unchanged.
    """

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(unique_id).lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        # Remove this test's codec from the local registry and the caches.
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)
        try:
            _forget_codec(self.codec_name)
        except KeyError:
            pass

    def set_codec(self, encode, decode):
        """Install encode/decode callables as this test's codec."""
        codec_info = codecs.CodecInfo(encode, decode,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Assert the block raises a wrapped exc_type chained from exc_type."""
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
                  operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        self.assertIsInstance(caught.exception.__cause__, exc_type)
        self.assertIsNotNone(caught.exception.__cause__.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """The wrapped error must show up on all four encode/decode paths."""
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
            codecs.encode("str_input", self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        msg = "This should be wrapped"
        self.check_wrapped(RuntimeError(msg), msg)

    def test_raise_grandchild_subclass_exact_size(self):
        # A slotted subclass keeps the same instance layout and stays
        # eligible for wrapping.
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """The given exception must propagate unwrapped on all four paths."""
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        with self.assertRaisesRegex(RuntimeError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        # Extra per-instance state disqualifies the exception from wrapping.
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(msg))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
               r"use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
2930
2931
2932
2933@unittest.skipUnless(sys.platform == 'win32',
2934                     'code pages are specific to Windows')
2935class CodePageTest(unittest.TestCase):
2936    CP_UTF8 = 65001
2937
2938    def test_invalid_code_page(self):
2939        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
2940        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
2941        self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
2942        self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')
2943
    def test_code_page_name(self):
        """The failing code page name must appear in the exception message."""
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
            codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
            codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
        # The UTF-8 code page reports the symbolic name, not 'cp65001'.
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
            codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)
2951
2952    def check_decode(self, cp, tests):
2953        for raw, errors, expected in tests:
2954            if expected is not None:
2955                try:
2956                    decoded = codecs.code_page_decode(cp, raw, errors, True)
2957                except UnicodeDecodeError as err:
2958                    self.fail('Unable to decode %a from "cp%s" with '
2959                              'errors=%r: %s' % (raw, cp, errors, err))
2960                self.assertEqual(decoded[0], expected,
2961                    '%a.decode("cp%s", %r)=%a != %a'
2962                    % (raw, cp, errors, decoded[0], expected))
2963                # assert 0 <= decoded[1] <= len(raw)
2964                self.assertGreaterEqual(decoded[1], 0)
2965                self.assertLessEqual(decoded[1], len(raw))
2966            else:
2967                self.assertRaises(UnicodeDecodeError,
2968                    codecs.code_page_decode, cp, raw, errors, True)
2969
2970    def check_encode(self, cp, tests):
2971        for text, errors, expected in tests:
2972            if expected is not None:
2973                try:
2974                    encoded = codecs.code_page_encode(cp, text, errors)
2975                except UnicodeEncodeError as err:
2976                    self.fail('Unable to encode %a to "cp%s" with '
2977                              'errors=%r: %s' % (text, cp, errors, err))
2978                self.assertEqual(encoded[0], expected,
2979                    '%a.encode("cp%s", %r)=%a != %a'
2980                    % (text, cp, errors, encoded[0], expected))
2981                self.assertEqual(encoded[1], len(text))
2982            else:
2983                self.assertRaises(UnicodeEncodeError,
2984                    codecs.code_page_encode, cp, text, errors)
2985
    def test_cp932(self):
        """Encode/decode checks for the double-byte Japanese cp932 page."""
        self.check_encode(932, (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        ))
        self.check_decode(932, (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            # \x81 is a lead byte, so \x81\x00 is an invalid sequence
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        ))
3018
    def test_cp1252(self):
        """Encode/decode checks for the single-byte Western cp1252 page."""
        self.check_encode(1252, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict',  b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        ))
        self.check_decode(1252, (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ))
3036
    def test_cp_utf7(self):
        """Round-trip checks for code page 65000 (Microsoft's UTF-7)."""
        cp = 65000
        self.check_encode(cp, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict',  b'+AOkgrA-'),
            ('\U0010ffff', 'strict',  b'+2//f/w-'),
            # Lone surrogates are representable in this UTF-7 variant,
            # even under 'strict'.
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ))
        self.check_decode(cp, (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            # NOTE: an unterminated shift sequence '+/' is silently dropped
            # and a raw \xff byte passes through even under 'strict'.
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ))
3056
    def test_multibyte_encoding(self):
        """Error handling for invalid/truncated multibyte sequences."""
        self.check_decode(932, (
            # The leading \x84 cannot form a valid cp932 pair with \xe9
            # here; the handlers drop it or replace it with U+FFFD while
            # \xe9\x80 still decodes to U+9A3E.
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        # CP_UTF8 is a class-level constant defined outside this view --
        # presumably code page 65001 (UTF-8); confirm against the class
        # header.
        self.check_decode(self.CP_UTF8, (
            # Invalid lead byte \xff followed by a valid 4-byte sequence
            # for U+10FFFF.
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        self.check_encode(self.CP_UTF8, (
            # The lone surrogate U+DC80 is unencodable; U+10FFFF is fine.
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        ))
3070
3071    def test_code_page_decode_flags(self):
3072        # Issue #36312: For some code pages (e.g. UTF-7) flags for
3073        # MultiByteToWideChar() must be set to 0.
3074        if support.verbose:
3075            sys.stdout.write('\n')
3076        for cp in (50220, 50221, 50222, 50225, 50227, 50229,
3077                   *range(57002, 57011+1), 65000):
3078            # On small versions of Windows like Windows IoT
3079            # not all codepages are present.
3080            # A missing codepage causes an OSError exception
3081            # so check for the codepage before decoding
3082            if is_code_page_present(cp):
3083                self.assertEqual(codecs.code_page_decode(cp, b'abc'), ('abc', 3), f'cp{cp}')
3084            else:
3085                if support.verbose:
3086                    print(f"  skipping cp={cp}")
3087        self.assertEqual(codecs.code_page_decode(42, b'abc'),
3088                         ('\uf061\uf062\uf063', 3))
3089
3090    def test_incremental(self):
3091        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
3092        self.assertEqual(decoded, ('', 0))
3093
3094        decoded = codecs.code_page_decode(932,
3095                                          b'\xe9\x80\xe9', 'strict',
3096                                          False)
3097        self.assertEqual(decoded, ('\u9a3e', 2))
3098
3099        decoded = codecs.code_page_decode(932,
3100                                          b'\xe9\x80\xe9\x80', 'strict',
3101                                          False)
3102        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))
3103
3104        decoded = codecs.code_page_decode(932,
3105                                          b'abc', 'strict',
3106                                          False)
3107        self.assertEqual(decoded, ('abc', 3))
3108
3109    def test_mbcs_alias(self):
3110        # Check that looking up our 'default' codepage will return
3111        # mbcs when we don't have a more specific one available
3112        with mock.patch('_winapi.GetACP', return_value=123):
3113            codec = codecs.lookup('cp123')
3114            self.assertEqual(codec.name, 'mbcs')
3115
    @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
    def test_large_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain undecodable bytes before and after
        # the INT_MAX limit.
        encoded = (b'01234567' * ((size//8)-1) +
                   b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
        self.assertEqual(len(encoded), size+2)
        decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
        # The whole input must have been consumed.
        self.assertEqual(decoded[1], len(encoded))
        # Drop the ~2 GiB input before inspecting the result to keep the
        # peak memory usage of this bigmem test down.
        del encoded
        # Every input byte maps to exactly one character here (ASCII or a
        # surrogate escape), so lengths must match.
        self.assertEqual(len(decoded[0]), decoded[1])
        self.assertEqual(decoded[0][:10], '0123456701')
        # The ten undecodable trailing bytes come back as the
        # corresponding U+DCxx surrogate escapes.
        self.assertEqual(decoded[0][-20:],
                         '6701234567'
                         '\udc85\udc86\udcea\udceb\udcec'
                         '\udcef\udcfc\udcfd\udcfe\udcff')
3133
    @support.bigmemtest(size=2**31, memuse=6, dry_run=False)
    def test_large_utf8_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain a decodable multi-byte character
        # surrounding INT_MAX
        # b'\xed\x84\x80' is the UTF-8 encoding of U+D100 (a 3-byte
        # sequence just below the surrogate range), so each 10-byte unit
        # decodes to 8 characters.
        encoded = (b'0123456\xed\x84\x80' * (size//8))
        self.assertEqual(len(encoded), size // 8 * 10)
        decoded = codecs.code_page_decode(65001, encoded, 'ignore', True)
        # The whole input must have been consumed.
        self.assertEqual(decoded[1], len(encoded))
        # Free the huge input early to limit peak memory usage.
        del encoded
        self.assertEqual(len(decoded[0]), size)
        self.assertEqual(decoded[0][:10], '0123456\ud10001')
        self.assertEqual(decoded[0][-11:], '56\ud1000123456\ud100')
3147
3148
class ASCIITest(unittest.TestCase):
    """Tests for the built-in 'ascii' codec and its error handlers."""

    def test_encode(self):
        self.assertEqual('abc123'.encode('ascii'), b'abc123')

    def test_encode_error(self):
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, output in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=output):
                self.assertEqual(text.encode('ascii', handler), output)

    def test_encode_surrogateescape_error(self):
        # The lone surrogate maps back to a byte, but '\xff' itself is
        # still unencodable in ASCII, so the call must fail.
        with self.assertRaises(UnicodeEncodeError):
            '\udc80\xff'.encode('ascii', 'surrogateescape')

    def test_decode(self):
        self.assertEqual(b'abc'.decode('ascii'), 'abc')

    def test_decode_error(self):
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for raw, handler, output in cases:
            with self.subTest(data=raw, error_handler=handler,
                              expected=output):
                self.assertEqual(raw.decode('ascii', handler), output)
3186
3187
class Latin1Test(unittest.TestCase):
    """Tests for the built-in 'latin1' codec and its error handlers."""

    def test_encode(self):
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for text, output in cases:
            with self.subTest(data=text, expected=output):
                self.assertEqual(text.encode('latin1'), output)

    def test_encode_errors(self):
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, output in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=output):
                self.assertEqual(text.encode('latin1', handler), output)

    def test_encode_surrogateescape_error(self):
        # The lone surrogate maps back to a byte, but '\u20ac' has no
        # Latin-1 encoding, so the call must still fail.
        with self.assertRaises(UnicodeEncodeError):
            '\udc80\u20ac'.encode('latin1', 'surrogateescape')

    def test_decode(self):
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for raw, output in cases:
            with self.subTest(data=raw, expected=output):
                self.assertEqual(raw.decode('latin1'), output)
3223
3224
class StreamRecoderTest(unittest.TestCase):
    """Tests for codecs.StreamRecoder and codecs.EncodedFile."""

    def test_writelines(self):
        raw = io.BytesIO()
        info = codecs.lookup('ascii')
        recoder = codecs.StreamRecoder(raw, info.encode, info.decode,
                                       encodings.ascii.StreamReader,
                                       encodings.ascii.StreamWriter)
        recoder.writelines([b'a', b'b'])
        self.assertEqual(raw.getvalue(), b'ab')

    def test_write(self):
        raw = io.BytesIO()
        info = codecs.lookup('latin1')
        # Recode from Latin-1 to utf-8.
        recoder = codecs.StreamRecoder(raw, info.encode, info.decode,
                                       encodings.utf_8.StreamReader,
                                       encodings.utf_8.StreamWriter)
        text = 'àñé'
        recoder.write(text.encode('latin1'))
        self.assertEqual(raw.getvalue(), text.encode('utf-8'))

    def test_seeking_read(self):
        raw = io.BytesIO('line1\nline2\nline3\n'.encode('utf-16-le'))
        ef = codecs.EncodedFile(raw, 'utf-8', 'utf-16-le')
        self.assertEqual(ef.readline(), b'line1\n')
        ef.seek(0)
        for expected in (b'line1\n', b'line2\n', b'line3\n', b''):
            self.assertEqual(ef.readline(), expected)

    def test_seeking_write(self):
        raw = io.BytesIO('123456789\n'.encode('utf-16-le'))
        ef = codecs.EncodedFile(raw, 'utf-8', 'utf-16-le')
        # seek() must only reset its internal buffer when both offset
        # and whence are zero.
        ef.seek(2)
        ef.write(b'\nabc\n')
        self.assertEqual(ef.readline(), b'789\n')
        ef.seek(0)
        for expected in (b'1\n', b'abc\n', b'789\n'):
            self.assertEqual(ef.readline(), expected)
3269
3270
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
class LocaleCodecTest(unittest.TestCase):
    """
    Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().
    """
    # All expectations are computed against the host interpreter's
    # filesystem encoding, so results adapt to the platform and locale.
    ENCODING = sys.getfilesystemencoding()
    # Sample texts: pure ASCII, Latin-1 range, astral characters, and
    # lone surrogates (which only the surrogate* handlers can encode).
    STRINGS = ("ascii", "ulatin1:\xa7\xe9",
               "u255:\xff",
               "UCS:\xe9\u20ac\U0010ffff",
               "surrogates:\uDC80\uDCFF")
    # Raw byte samples used as decode inputs alongside encoded STRINGS.
    BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
    SURROGATES = "\uDC80\uDCFF"

    def encode(self, text, errors="strict"):
        """Encode *text* with the C-level locale codec helper."""
        return _testcapi.EncodeLocaleEx(text, 0, errors)

    def check_encode_strings(self, errors):
        """Encode each sample string and compare with str.encode().

        When str.encode() raises UnicodeEncodeError, the C codec is
        expected to fail too, but it surfaces the failure as a
        RuntimeError carrying an "encode error: pos=..., reason=..."
        message instead.
        """
        for text in self.STRINGS:
            with self.subTest(text=text):
                try:
                    expected = text.encode(self.ENCODING, errors)
                except UnicodeEncodeError:
                    with self.assertRaises(RuntimeError) as cm:
                        self.encode(text, errors)
                    errmsg = str(cm.exception)
                    self.assertRegex(errmsg, r"encode error: pos=[0-9]+, reason=")
                else:
                    encoded = self.encode(text, errors)
                    self.assertEqual(encoded, expected)

    def test_encode_strict(self):
        self.check_encode_strings("strict")

    def test_encode_surrogateescape(self):
        self.check_encode_strings("surrogateescape")

    def test_encode_surrogatepass(self):
        # Not every locale codec supports surrogatepass; probe first and
        # skip rather than fail on such platforms.
        try:
            self.encode('', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
                              f"surrogatepass error handler")
            else:
                raise

        self.check_encode_strings("surrogatepass")

    def test_encode_unsupported_error_handler(self):
        # The C helper only supports a fixed set of error handlers.
        with self.assertRaises(ValueError) as cm:
            self.encode('', 'backslashreplace')
        self.assertEqual(str(cm.exception), 'unsupported error handler')

    def decode(self, encoded, errors="strict"):
        """Decode *encoded* with the C-level locale codec helper."""
        return _testcapi.DecodeLocaleEx(encoded, 0, errors)

    def check_decode_strings(self, errors):
        """Decode a collection of byte strings and compare with
        bytes.decode().

        The collection is built from BYTES_STRINGS plus every STRINGS
        entry that can be encoded with the filesystem encoding; on UTF-8
        the surrogate* handlers are used so surrogate samples are kept.
        A UnicodeDecodeError from bytes.decode() is expected to surface
        from the C helper as a RuntimeError with a "decode error: "
        message.
        """
        is_utf8 = (self.ENCODING == "utf-8")
        if is_utf8:
            encode_errors = 'surrogateescape'
        else:
            encode_errors = 'strict'

        strings = list(self.BYTES_STRINGS)
        for text in self.STRINGS:
            try:
                encoded = text.encode(self.ENCODING, encode_errors)
                if encoded not in strings:
                    strings.append(encoded)
            except UnicodeEncodeError:
                encoded = None

            if is_utf8:
                # Also cover the surrogatepass spelling when it differs
                # from the surrogateescape one.
                encoded2 = text.encode(self.ENCODING, 'surrogatepass')
                if encoded2 != encoded:
                    strings.append(encoded2)

        for encoded in strings:
            with self.subTest(encoded=encoded):
                try:
                    expected = encoded.decode(self.ENCODING, errors)
                except UnicodeDecodeError:
                    with self.assertRaises(RuntimeError) as cm:
                        self.decode(encoded, errors)
                    errmsg = str(cm.exception)
                    self.assertTrue(errmsg.startswith("decode error: "), errmsg)
                else:
                    decoded = self.decode(encoded, errors)
                    self.assertEqual(decoded, expected)

    def test_decode_strict(self):
        self.check_decode_strings("strict")

    def test_decode_surrogateescape(self):
        self.check_decode_strings("surrogateescape")

    def test_decode_surrogatepass(self):
        # Not every locale codec supports surrogatepass; probe first and
        # skip rather than fail on such platforms.
        try:
            self.decode(b'', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
                              f"surrogatepass error handler")
            else:
                raise

        self.check_decode_strings("surrogatepass")

    def test_decode_unsupported_error_handler(self):
        # The C helper only supports a fixed set of error handlers.
        with self.assertRaises(ValueError) as cm:
            self.decode(b'', 'backslashreplace')
        self.assertEqual(str(cm.exception), 'unsupported error handler')
3383
3384
class Rot13Test(unittest.TestCase):
    """Test the educational ROT-13 codec."""

    def test_encode(self):
        encoded = codecs.encode("Caesar liked ciphers", 'rot-13')
        self.assertEqual(encoded, 'Pnrfne yvxrq pvcuref')

    def test_decode(self):
        decoded = codecs.decode('Rg gh, Oehgr?', 'rot-13')
        self.assertEqual(decoded, 'Et tu, Brute?')

    def test_incremental_encode(self):
        enc = codecs.getincrementalencoder('rot-13')()
        self.assertEqual(enc.encode('ABBA nag Cheryl Baker'),
                         'NOON ant Purely Onxre')

    def test_incremental_decode(self):
        dec = codecs.getincrementaldecoder('rot-13')()
        self.assertEqual(dec.decode('terra Ares envy tha'),
                         'green Nerf rail gun')
3404
3405
class Rot13UtilTest(unittest.TestCase):
    """Test the ROT-13 codec via rot13 function,
    i.e. the user has done something like:
    $ echo "Hello World" | python -m encodings.rot_13
    """
    def test_rot13_func(self):
        # Import the submodule explicitly: the module-level
        # 'import encodings' does not make the 'rot_13' attribute
        # available unless an earlier codec lookup happened to pull it
        # in, so this test would be order-dependent without this line.
        import encodings.rot_13
        infile = io.StringIO('Gb or, be abg gb or, gung vf gur dhrfgvba')
        outfile = io.StringIO()
        encodings.rot_13.rot13(infile, outfile)
        outfile.seek(0)
        plain_text = outfile.read()
        self.assertEqual(
            plain_text,
            'To be, or not to be, that is the question')
3420
3421
if __name__ == "__main__":
    # Run the full test suite when this file is executed as a script.
    unittest.main()
3424