1import codecs 2import contextlib 3import io 4import locale 5import sys 6import unittest 7import encodings 8from unittest import mock 9 10from test import support 11 12try: 13 import _testcapi 14except ImportError: 15 _testcapi = None 16 17try: 18 import ctypes 19except ImportError: 20 ctypes = None 21 SIZEOF_WCHAR_T = -1 22else: 23 SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar) 24 25def coding_checker(self, coder): 26 def check(input, expect): 27 self.assertEqual(coder(input), (expect, len(input))) 28 return check 29 30# On small versions of Windows like Windows IoT or Windows Nano Server not all codepages are present 31def is_code_page_present(cp): 32 from ctypes import POINTER, WINFUNCTYPE, WinDLL 33 from ctypes.wintypes import BOOL, UINT, BYTE, WCHAR, UINT, DWORD 34 35 MAX_LEADBYTES = 12 # 5 ranges, 2 bytes ea., 0 term. 36 MAX_DEFAULTCHAR = 2 # single or double byte 37 MAX_PATH = 260 38 class CPINFOEXW(ctypes.Structure): 39 _fields_ = [("MaxCharSize", UINT), 40 ("DefaultChar", BYTE*MAX_DEFAULTCHAR), 41 ("LeadByte", BYTE*MAX_LEADBYTES), 42 ("UnicodeDefaultChar", WCHAR), 43 ("CodePage", UINT), 44 ("CodePageName", WCHAR*MAX_PATH)] 45 46 prototype = WINFUNCTYPE(BOOL, UINT, DWORD, POINTER(CPINFOEXW)) 47 GetCPInfoEx = prototype(("GetCPInfoExW", WinDLL("kernel32"))) 48 info = CPINFOEXW() 49 return GetCPInfoEx(cp, 0, info) 50 51class Queue(object): 52 """ 53 queue: write bytes at one end, read bytes from the other end 54 """ 55 def __init__(self, buffer): 56 self._buffer = buffer 57 58 def write(self, chars): 59 self._buffer += chars 60 61 def read(self, size=-1): 62 if size<0: 63 s = self._buffer 64 self._buffer = self._buffer[:0] # make empty 65 return s 66 else: 67 s = self._buffer[:size] 68 self._buffer = self._buffer[size:] 69 return s 70 71 72class MixInCheckStateHandling: 73 def check_state_handling_decode(self, encoding, u, s): 74 for i in range(len(s)+1): 75 d = codecs.getincrementaldecoder(encoding)() 76 part1 = d.decode(s[:i]) 77 state = d.getstate() 78 self.assertIsInstance(state[1], int) 79 # Check that the condition stated in the documentation for 80 # IncrementalDecoder.getstate() holds 81 if not state[1]: 82 # reset decoder to the default state without anything buffered 83 d.setstate((state[0][:0], 0)) 84 # Feeding the previous input may not produce any output 85 self.assertTrue(not d.decode(state[0])) 86 # The decoder must return to the same state 87 self.assertEqual(state, d.getstate()) 88 # Create a new decoder and set it to the state 89 # we extracted from the old one 90 d = codecs.getincrementaldecoder(encoding)() 91 d.setstate(state) 92 part2 = d.decode(s[i:], True) 93 self.assertEqual(u, part1+part2) 94 95 def check_state_handling_encode(self, encoding, u, s): 96 for i in range(len(u)+1): 97 d = codecs.getincrementalencoder(encoding)() 98 part1 = d.encode(u[:i]) 99 state = d.getstate() 100 d = codecs.getincrementalencoder(encoding)() 101 d.setstate(state) 102 part2 = d.encode(u[i:], True) 103 self.assertEqual(s, part1+part2) 104 105 106class ReadTest(MixInCheckStateHandling): 107 def check_partial(self, input, partialresults): 108 # get a StreamReader for the encoding and feed the bytestring version 109 # of input to the reader byte by byte. Read everything available from 110 # the StreamReader and check that the results equal the appropriate 111 # entries from partialresults. 112 q = Queue(b"") 113 r = codecs.getreader(self.encoding)(q) 114 result = "" 115 for (c, partialresult) in zip(input.encode(self.encoding), partialresults): 116 q.write(bytes([c])) 117 result += r.read() 118 self.assertEqual(result, partialresult) 119 # check that there's nothing left in the buffers 120 self.assertEqual(r.read(), "") 121 self.assertEqual(r.bytebuffer, b"") 122 123 # do the check again, this time using an incremental decoder 124 d = codecs.getincrementaldecoder(self.encoding)() 125 result = "" 126 for (c, partialresult) in zip(input.encode(self.encoding), partialresults): 127 result += d.decode(bytes([c])) 128 self.assertEqual(result, partialresult) 129 # check that there's nothing left in the buffers 130 self.assertEqual(d.decode(b"", True), "") 131 self.assertEqual(d.buffer, b"") 132 133 # Check whether the reset method works properly 134 d.reset() 135 result = "" 136 for (c, partialresult) in zip(input.encode(self.encoding), partialresults): 137 result += d.decode(bytes([c])) 138 self.assertEqual(result, partialresult) 139 # check that there's nothing left in the buffers 140 self.assertEqual(d.decode(b"", True), "") 141 self.assertEqual(d.buffer, b"") 142 143 # check iterdecode() 144 encoded = input.encode(self.encoding) 145 self.assertEqual( 146 input, 147 "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding)) 148 ) 149 150 def test_readline(self): 151 def getreader(input): 152 stream = io.BytesIO(input.encode(self.encoding)) 153 return codecs.getreader(self.encoding)(stream) 154 155 def readalllines(input, keepends=True, size=None): 156 reader = getreader(input) 157 lines = [] 158 while True: 159 line = reader.readline(size=size, keepends=keepends) 160 if not line: 161 break 162 lines.append(line) 163 return "|".join(lines) 164 165 s = "foo\nbar\r\nbaz\rspam\u2028eggs" 166 sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs" 167 sexpectednoends = "foo|bar|baz|spam|eggs" 168 self.assertEqual(readalllines(s, True), sexpected) 169 self.assertEqual(readalllines(s, False), sexpectednoends) 170 self.assertEqual(readalllines(s, True, 10), sexpected) 171 self.assertEqual(readalllines(s, False, 10), sexpectednoends) 172 173 lineends = ("\n", "\r\n", "\r", "\u2028") 174 # Test long lines (multiple calls to read() in readline()) 175 vw = [] 176 vwo = [] 177 for (i, lineend) in enumerate(lineends): 178 vw.append((i*200+200)*"\u3042" + lineend) 179 vwo.append((i*200+200)*"\u3042") 180 self.assertEqual(readalllines("".join(vw), True), "|".join(vw)) 181 self.assertEqual(readalllines("".join(vw), False), "|".join(vwo)) 182 183 # Test lines where the first read might end with \r, so the 184 # reader has to look ahead whether this is a lone \r or a \r\n 185 for size in range(80): 186 for lineend in lineends: 187 s = 10*(size*"a" + lineend + "xxx\n") 188 reader = getreader(s) 189 for i in range(10): 190 self.assertEqual( 191 reader.readline(keepends=True), 192 size*"a" + lineend, 193 ) 194 self.assertEqual( 195 reader.readline(keepends=True), 196 "xxx\n", 197 ) 198 reader = getreader(s) 199 for i in range(10): 200 self.assertEqual( 201 reader.readline(keepends=False), 202 size*"a", 203 ) 204 self.assertEqual( 205 reader.readline(keepends=False), 206 "xxx", 207 ) 208 209 def test_mixed_readline_and_read(self): 210 lines = ["Humpty Dumpty sat on a wall,\n", 211 "Humpty Dumpty had a great fall.\r\n", 212 "All the king's horses and all the king's men\r", 213 "Couldn't put Humpty together again."] 214 data = ''.join(lines) 215 def getreader(): 216 stream = io.BytesIO(data.encode(self.encoding)) 217 return codecs.getreader(self.encoding)(stream) 218 219 # Issue #8260: Test readline() followed by read() 220 f = getreader() 221 self.assertEqual(f.readline(), lines[0]) 222 self.assertEqual(f.read(), ''.join(lines[1:])) 223 self.assertEqual(f.read(), '') 224 225 # Issue #32110: Test readline() followed by read(n) 226 f = getreader() 227 self.assertEqual(f.readline(), lines[0]) 228 self.assertEqual(f.read(1), lines[1][0]) 229 self.assertEqual(f.read(0), '') 230 self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100]) 231 232 # Issue #16636: Test readline() followed by readlines() 233 f = getreader() 234 self.assertEqual(f.readline(), lines[0]) 235 self.assertEqual(f.readlines(), lines[1:]) 236 self.assertEqual(f.read(), '') 237 238 # Test read(n) followed by read() 239 f = getreader() 240 self.assertEqual(f.read(size=40, chars=5), data[:5]) 241 self.assertEqual(f.read(), data[5:]) 242 self.assertEqual(f.read(), '') 243 244 # Issue #32110: Test read(n) followed by read(n) 245 f = getreader() 246 self.assertEqual(f.read(size=40, chars=5), data[:5]) 247 self.assertEqual(f.read(1), data[5]) 248 self.assertEqual(f.read(0), '') 249 self.assertEqual(f.read(100), data[6:106]) 250 251 # Issue #12446: Test read(n) followed by readlines() 252 f = getreader() 253 self.assertEqual(f.read(size=40, chars=5), data[:5]) 254 self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:]) 255 self.assertEqual(f.read(), '') 256 257 def test_bug1175396(self): 258 s = [ 259 '<%!--===================================================\r\n', 260 ' BLOG index page: show recent articles,\r\n', 261 ' today\'s articles, or articles of a specific date.\r\n', 262 '========================================================--%>\r\n', 263 '<%@inputencoding="ISO-8859-1"%>\r\n', 264 '<%@pagetemplate=TEMPLATE.y%>\r\n', 265 '<%@import=import frog.util, frog%>\r\n', 266 '<%@import=import frog.objects%>\r\n', 267 '<%@import=from frog.storageerrors import StorageError%>\r\n', 268 '<%\r\n', 269 '\r\n', 270 'import logging\r\n', 271 'log=logging.getLogger("Snakelets.logger")\r\n', 272 '\r\n', 273 '\r\n', 274 'user=self.SessionCtx.user\r\n', 275 'storageEngine=self.SessionCtx.storageEngine\r\n', 276 '\r\n', 277 '\r\n', 278 'def readArticlesFromDate(date, count=None):\r\n', 279 ' entryids=storageEngine.listBlogEntries(date)\r\n', 280 ' entryids.reverse() # descending\r\n', 281 ' if count:\r\n', 282 ' entryids=entryids[:count]\r\n', 283 ' try:\r\n', 284 ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n', 285 ' except StorageError,x:\r\n', 286 ' log.error("Error loading articles: "+str(x))\r\n', 287 ' self.abort("cannot load articles")\r\n', 288 '\r\n', 289 'showdate=None\r\n', 290 '\r\n', 291 'arg=self.Request.getArg()\r\n', 292 'if arg=="today":\r\n', 293 ' #-------------------- TODAY\'S ARTICLES\r\n', 294 ' self.write("<h2>Today\'s articles</h2>")\r\n', 295 ' showdate = frog.util.isodatestr() \r\n', 296 ' entries = readArticlesFromDate(showdate)\r\n', 297 'elif arg=="active":\r\n', 298 ' #-------------------- ACTIVE ARTICLES redirect\r\n', 299 ' self.Yredirect("active.y")\r\n', 300 'elif arg=="login":\r\n', 301 ' #-------------------- LOGIN PAGE redirect\r\n', 302 ' self.Yredirect("login.y")\r\n', 303 'elif arg=="date":\r\n', 304 ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n', 305 ' showdate = self.Request.getParameter("date")\r\n', 306 ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n', 307 ' entries = readArticlesFromDate(showdate)\r\n', 308 'else:\r\n', 309 ' #-------------------- RECENT ARTICLES\r\n', 310 ' self.write("<h2>Recent articles</h2>")\r\n', 311 ' dates=storageEngine.listBlogEntryDates()\r\n', 312 ' if dates:\r\n', 313 ' entries=[]\r\n', 314 ' SHOWAMOUNT=10\r\n', 315 ' for showdate in dates:\r\n', 316 ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n', 317 ' if len(entries)>=SHOWAMOUNT:\r\n', 318 ' break\r\n', 319 ' \r\n', 320 ] 321 stream = io.BytesIO("".join(s).encode(self.encoding)) 322 reader = codecs.getreader(self.encoding)(stream) 323 for (i, line) in enumerate(reader): 324 self.assertEqual(line, s[i]) 325 326 def test_readlinequeue(self): 327 q = Queue(b"") 328 writer = codecs.getwriter(self.encoding)(q) 329 reader = codecs.getreader(self.encoding)(q) 330 331 # No lineends 332 writer.write("foo\r") 333 self.assertEqual(reader.readline(keepends=False), "foo") 334 writer.write("\nbar\r") 335 self.assertEqual(reader.readline(keepends=False), "") 336 self.assertEqual(reader.readline(keepends=False), "bar") 337 writer.write("baz") 338 self.assertEqual(reader.readline(keepends=False), "baz") 339 self.assertEqual(reader.readline(keepends=False), "") 340 341 # Lineends 342 writer.write("foo\r") 343 self.assertEqual(reader.readline(keepends=True), "foo\r") 344 writer.write("\nbar\r") 345 self.assertEqual(reader.readline(keepends=True), "\n") 346 self.assertEqual(reader.readline(keepends=True), "bar\r") 347 writer.write("baz") 348 self.assertEqual(reader.readline(keepends=True), "baz") 349 self.assertEqual(reader.readline(keepends=True), "") 350 writer.write("foo\r\n") 351 self.assertEqual(reader.readline(keepends=True), "foo\r\n") 352 353 def test_bug1098990_a(self): 354 s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n" 355 s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n" 356 s3 = "next line.\r\n" 357 358 s = (s1+s2+s3).encode(self.encoding) 359 stream = io.BytesIO(s) 360 reader = codecs.getreader(self.encoding)(stream) 361 self.assertEqual(reader.readline(), s1) 362 self.assertEqual(reader.readline(), s2) 363 self.assertEqual(reader.readline(), s3) 364 self.assertEqual(reader.readline(), "") 365 366 def test_bug1098990_b(self): 367 s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n" 368 s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n" 369 s3 = "stillokay:bbbbxx\r\n" 370 s4 = "broken!!!!badbad\r\n" 371 s5 = "againokay.\r\n" 372 373 s = (s1+s2+s3+s4+s5).encode(self.encoding) 374 stream = io.BytesIO(s) 375 reader = codecs.getreader(self.encoding)(stream) 376 self.assertEqual(reader.readline(), s1) 377 self.assertEqual(reader.readline(), s2) 378 self.assertEqual(reader.readline(), s3) 379 self.assertEqual(reader.readline(), s4) 380 self.assertEqual(reader.readline(), s5) 381 self.assertEqual(reader.readline(), "") 382 383 ill_formed_sequence_replace = "\ufffd" 384 385 def test_lone_surrogates(self): 386 self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding) 387 self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"), 388 "[\\udc80]".encode(self.encoding)) 389 self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"), 390 "[\\udc80]".encode(self.encoding)) 391 self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"), 392 "[�]".encode(self.encoding)) 393 self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"), 394 "[]".encode(self.encoding)) 395 self.assertEqual("[\uDC80]".encode(self.encoding, "replace"), 396 "[?]".encode(self.encoding)) 397 398 # sequential surrogate characters 399 self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"), 400 "[]".encode(self.encoding)) 401 self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"), 402 "[??]".encode(self.encoding)) 403 404 bom = "".encode(self.encoding) 405 for before, after in [("\U00010fff", "A"), ("[", "]"), 406 ("A", "\U00010fff")]: 407 before_sequence = before.encode(self.encoding)[len(bom):] 408 after_sequence = after.encode(self.encoding)[len(bom):] 409 test_string = before + "\uDC80" + after 410 test_sequence = (bom + before_sequence + 411 self.ill_formed_sequence + after_sequence) 412 self.assertRaises(UnicodeDecodeError, test_sequence.decode, 413 self.encoding) 414 self.assertEqual(test_string.encode(self.encoding, 415 "surrogatepass"), 416 test_sequence) 417 self.assertEqual(test_sequence.decode(self.encoding, 418 "surrogatepass"), 419 test_string) 420 self.assertEqual(test_sequence.decode(self.encoding, "ignore"), 421 before + after) 422 self.assertEqual(test_sequence.decode(self.encoding, "replace"), 423 before + self.ill_formed_sequence_replace + after) 424 backslashreplace = ''.join('\\x%02x' % b 425 for b in self.ill_formed_sequence) 426 self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"), 427 before + backslashreplace + after) 428 429 def test_incremental_surrogatepass(self): 430 # Test incremental decoder for surrogatepass handler: 431 # see issue #24214 432 # High surrogate 433 data = '\uD901'.encode(self.encoding, 'surrogatepass') 434 for i in range(1, len(data)): 435 dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass') 436 self.assertEqual(dec.decode(data[:i]), '') 437 self.assertEqual(dec.decode(data[i:], True), '\uD901') 438 # Low surrogate 439 data = '\uDC02'.encode(self.encoding, 'surrogatepass') 440 for i in range(1, len(data)): 441 dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass') 442 self.assertEqual(dec.decode(data[:i]), '') 443 self.assertEqual(dec.decode(data[i:]), '\uDC02') 444 445 446class UTF32Test(ReadTest, unittest.TestCase): 447 encoding = "utf-32" 448 if sys.byteorder == 'little': 449 ill_formed_sequence = b"\x80\xdc\x00\x00" 450 else: 451 ill_formed_sequence = b"\x00\x00\xdc\x80" 452 453 spamle = (b'\xff\xfe\x00\x00' 454 b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00' 455 b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00') 456 spambe = (b'\x00\x00\xfe\xff' 457 b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m' 458 b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m') 459 460 def test_only_one_bom(self): 461 _,_,reader,writer = codecs.lookup(self.encoding) 462 # encode some stream 463 s = io.BytesIO() 464 f = writer(s) 465 f.write("spam") 466 f.write("spam") 467 d = s.getvalue() 468 # check whether there is exactly one BOM in it 469 self.assertTrue(d == self.spamle or d == self.spambe) 470 # try to read it back 471 s = io.BytesIO(d) 472 f = reader(s) 473 self.assertEqual(f.read(), "spamspam") 474 475 def test_badbom(self): 476 s = io.BytesIO(4*b"\xff") 477 f = codecs.getreader(self.encoding)(s) 478 self.assertRaises(UnicodeError, f.read) 479 480 s = io.BytesIO(8*b"\xff") 481 f = codecs.getreader(self.encoding)(s) 482 self.assertRaises(UnicodeError, f.read) 483 484 def test_partial(self): 485 self.check_partial( 486 "\x00\xff\u0100\uffff\U00010000", 487 [ 488 "", # first byte of BOM read 489 "", # second byte of BOM read 490 "", # third byte of BOM read 491 "", # fourth byte of BOM read => byteorder known 492 "", 493 "", 494 "", 495 "\x00", 496 "\x00", 497 "\x00", 498 "\x00", 499 "\x00\xff", 500 "\x00\xff", 501 "\x00\xff", 502 "\x00\xff", 503 "\x00\xff\u0100", 504 "\x00\xff\u0100", 505 "\x00\xff\u0100", 506 "\x00\xff\u0100", 507 "\x00\xff\u0100\uffff", 508 "\x00\xff\u0100\uffff", 509 "\x00\xff\u0100\uffff", 510 "\x00\xff\u0100\uffff", 511 "\x00\xff\u0100\uffff\U00010000", 512 ] 513 ) 514 515 def test_handlers(self): 516 self.assertEqual(('\ufffd', 1), 517 codecs.utf_32_decode(b'\x01', 'replace', True)) 518 self.assertEqual(('', 1), 519 codecs.utf_32_decode(b'\x01', 'ignore', True)) 520 521 def test_errors(self): 522 self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode, 523 b"\xff", "strict", True) 524 525 def test_decoder_state(self): 526 self.check_state_handling_decode(self.encoding, 527 "spamspam", self.spamle) 528 self.check_state_handling_decode(self.encoding, 529 "spamspam", self.spambe) 530 531 def test_issue8941(self): 532 # Issue #8941: insufficient result allocation when decoding into 533 # surrogate pairs on UCS-2 builds. 534 encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024 535 self.assertEqual('\U00010000' * 1024, 536 codecs.utf_32_decode(encoded_le)[0]) 537 encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024 538 self.assertEqual('\U00010000' * 1024, 539 codecs.utf_32_decode(encoded_be)[0]) 540 541 542class UTF32LETest(ReadTest, unittest.TestCase): 543 encoding = "utf-32-le" 544 ill_formed_sequence = b"\x80\xdc\x00\x00" 545 546 def test_partial(self): 547 self.check_partial( 548 "\x00\xff\u0100\uffff\U00010000", 549 [ 550 "", 551 "", 552 "", 553 "\x00", 554 "\x00", 555 "\x00", 556 "\x00", 557 "\x00\xff", 558 "\x00\xff", 559 "\x00\xff", 560 "\x00\xff", 561 "\x00\xff\u0100", 562 "\x00\xff\u0100", 563 "\x00\xff\u0100", 564 "\x00\xff\u0100", 565 "\x00\xff\u0100\uffff", 566 "\x00\xff\u0100\uffff", 567 "\x00\xff\u0100\uffff", 568 "\x00\xff\u0100\uffff", 569 "\x00\xff\u0100\uffff\U00010000", 570 ] 571 ) 572 573 def test_simple(self): 574 self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00") 575 576 def test_errors(self): 577 self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode, 578 b"\xff", "strict", True) 579 580 def test_issue8941(self): 581 # Issue #8941: insufficient result allocation when decoding into 582 # surrogate pairs on UCS-2 builds. 583 encoded = b'\x00\x00\x01\x00' * 1024 584 self.assertEqual('\U00010000' * 1024, 585 codecs.utf_32_le_decode(encoded)[0]) 586 587 588class UTF32BETest(ReadTest, unittest.TestCase): 589 encoding = "utf-32-be" 590 ill_formed_sequence = b"\x00\x00\xdc\x80" 591 592 def test_partial(self): 593 self.check_partial( 594 "\x00\xff\u0100\uffff\U00010000", 595 [ 596 "", 597 "", 598 "", 599 "\x00", 600 "\x00", 601 "\x00", 602 "\x00", 603 "\x00\xff", 604 "\x00\xff", 605 "\x00\xff", 606 "\x00\xff", 607 "\x00\xff\u0100", 608 "\x00\xff\u0100", 609 "\x00\xff\u0100", 610 "\x00\xff\u0100", 611 "\x00\xff\u0100\uffff", 612 "\x00\xff\u0100\uffff", 613 "\x00\xff\u0100\uffff", 614 "\x00\xff\u0100\uffff", 615 "\x00\xff\u0100\uffff\U00010000", 616 ] 617 ) 618 619 def test_simple(self): 620 self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03") 621 622 def test_errors(self): 623 self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode, 624 b"\xff", "strict", True) 625 626 def test_issue8941(self): 627 # Issue #8941: insufficient result allocation when decoding into 628 # surrogate pairs on UCS-2 builds. 629 encoded = b'\x00\x01\x00\x00' * 1024 630 self.assertEqual('\U00010000' * 1024, 631 codecs.utf_32_be_decode(encoded)[0]) 632 633 634class UTF16Test(ReadTest, unittest.TestCase): 635 encoding = "utf-16" 636 if sys.byteorder == 'little': 637 ill_formed_sequence = b"\x80\xdc" 638 else: 639 ill_formed_sequence = b"\xdc\x80" 640 641 spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00' 642 spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m' 643 644 def test_only_one_bom(self): 645 _,_,reader,writer = codecs.lookup(self.encoding) 646 # encode some stream 647 s = io.BytesIO() 648 f = writer(s) 649 f.write("spam") 650 f.write("spam") 651 d = s.getvalue() 652 # check whether there is exactly one BOM in it 653 self.assertTrue(d == self.spamle or d == self.spambe) 654 # try to read it back 655 s = io.BytesIO(d) 656 f = reader(s) 657 self.assertEqual(f.read(), "spamspam") 658 659 def test_badbom(self): 660 s = io.BytesIO(b"\xff\xff") 661 f = codecs.getreader(self.encoding)(s) 662 self.assertRaises(UnicodeError, f.read) 663 664 s = io.BytesIO(b"\xff\xff\xff\xff") 665 f = codecs.getreader(self.encoding)(s) 666 self.assertRaises(UnicodeError, f.read) 667 668 def test_partial(self): 669 self.check_partial( 670 "\x00\xff\u0100\uffff\U00010000", 671 [ 672 "", # first byte of BOM read 673 "", # second byte of BOM read => byteorder known 674 "", 675 "\x00", 676 "\x00", 677 "\x00\xff", 678 "\x00\xff", 679 "\x00\xff\u0100", 680 "\x00\xff\u0100", 681 "\x00\xff\u0100\uffff", 682 "\x00\xff\u0100\uffff", 683 "\x00\xff\u0100\uffff", 684 "\x00\xff\u0100\uffff", 685 "\x00\xff\u0100\uffff\U00010000", 686 ] 687 ) 688 689 def test_handlers(self): 690 self.assertEqual(('\ufffd', 1), 691 codecs.utf_16_decode(b'\x01', 'replace', True)) 692 self.assertEqual(('', 1), 693 codecs.utf_16_decode(b'\x01', 'ignore', True)) 694 695 def test_errors(self): 696 self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, 697 b"\xff", "strict", True) 698 699 def test_decoder_state(self): 700 self.check_state_handling_decode(self.encoding, 701 "spamspam", self.spamle) 702 self.check_state_handling_decode(self.encoding, 703 "spamspam", self.spambe) 704 705 def test_bug691291(self): 706 # Files are always opened in binary mode, even if no binary mode was 707 # specified. This means that no automatic conversion of '\n' is done 708 # on reading and writing. 709 s1 = 'Hello\r\nworld\r\n' 710 711 s = s1.encode(self.encoding) 712 self.addCleanup(support.unlink, support.TESTFN) 713 with open(support.TESTFN, 'wb') as fp: 714 fp.write(s) 715 with support.check_warnings(('', DeprecationWarning)): 716 reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding) 717 with reader: 718 self.assertEqual(reader.read(), s1) 719 720class UTF16LETest(ReadTest, unittest.TestCase): 721 encoding = "utf-16-le" 722 ill_formed_sequence = b"\x80\xdc" 723 724 def test_partial(self): 725 self.check_partial( 726 "\x00\xff\u0100\uffff\U00010000", 727 [ 728 "", 729 "\x00", 730 "\x00", 731 "\x00\xff", 732 "\x00\xff", 733 "\x00\xff\u0100", 734 "\x00\xff\u0100", 735 "\x00\xff\u0100\uffff", 736 "\x00\xff\u0100\uffff", 737 "\x00\xff\u0100\uffff", 738 "\x00\xff\u0100\uffff", 739 "\x00\xff\u0100\uffff\U00010000", 740 ] 741 ) 742 743 def test_errors(self): 744 tests = [ 745 (b'\xff', '\ufffd'), 746 (b'A\x00Z', 'A\ufffd'), 747 (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'), 748 (b'\x00\xd8', '\ufffd'), 749 (b'\x00\xd8A', '\ufffd'), 750 (b'\x00\xd8A\x00', '\ufffdA'), 751 (b'\x00\xdcA\x00', '\ufffdA'), 752 ] 753 for raw, expected in tests: 754 self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, 755 raw, 'strict', True) 756 self.assertEqual(raw.decode('utf-16le', 'replace'), expected) 757 758 def test_nonbmp(self): 759 self.assertEqual("\U00010203".encode(self.encoding), 760 b'\x00\xd8\x03\xde') 761 self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding), 762 "\U00010203") 763 764class UTF16BETest(ReadTest, unittest.TestCase): 765 encoding = "utf-16-be" 766 ill_formed_sequence = b"\xdc\x80" 767 768 def test_partial(self): 769 self.check_partial( 770 "\x00\xff\u0100\uffff\U00010000", 771 [ 772 "", 773 "\x00", 774 "\x00", 775 "\x00\xff", 776 "\x00\xff", 777 "\x00\xff\u0100", 778 "\x00\xff\u0100", 779 "\x00\xff\u0100\uffff", 780 "\x00\xff\u0100\uffff", 781 "\x00\xff\u0100\uffff", 782 "\x00\xff\u0100\uffff", 783 "\x00\xff\u0100\uffff\U00010000", 784 ] 785 ) 786 787 def test_errors(self): 788 tests = [ 789 (b'\xff', '\ufffd'), 790 (b'\x00A\xff', 'A\ufffd'), 791 (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'), 792 (b'\xd8\x00', '\ufffd'), 793 (b'\xd8\x00\xdc', '\ufffd'), 794 (b'\xd8\x00\x00A', '\ufffdA'), 795 (b'\xdc\x00\x00A', '\ufffdA'), 796 ] 797 for raw, expected in tests: 798 self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, 799 raw, 'strict', True) 800 self.assertEqual(raw.decode('utf-16be', 'replace'), expected) 801 802 def test_nonbmp(self): 803 self.assertEqual("\U00010203".encode(self.encoding), 804 b'\xd8\x00\xde\x03') 805 self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding), 806 "\U00010203") 807 808class UTF8Test(ReadTest, unittest.TestCase): 809 encoding = "utf-8" 810 ill_formed_sequence = b"\xed\xb2\x80" 811 ill_formed_sequence_replace = "\ufffd" * 3 812 BOM = b'' 813 814 def test_partial(self): 815 self.check_partial( 816 "\x00\xff\u07ff\u0800\uffff\U00010000", 817 [ 818 "\x00", 819 "\x00", 820 "\x00\xff", 821 "\x00\xff", 822 "\x00\xff\u07ff", 823 "\x00\xff\u07ff", 824 "\x00\xff\u07ff", 825 "\x00\xff\u07ff\u0800", 826 "\x00\xff\u07ff\u0800", 827 "\x00\xff\u07ff\u0800", 828 "\x00\xff\u07ff\u0800\uffff", 829 "\x00\xff\u07ff\u0800\uffff", 830 "\x00\xff\u07ff\u0800\uffff", 831 "\x00\xff\u07ff\u0800\uffff", 832 "\x00\xff\u07ff\u0800\uffff\U00010000", 833 ] 834 ) 835 836 def test_decoder_state(self): 837 u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff" 838 self.check_state_handling_decode(self.encoding, 839 u, u.encode(self.encoding)) 840 841 def test_decode_error(self): 842 for data, error_handler, expected in ( 843 (b'[\x80\xff]', 'ignore', '[]'), 844 (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'), 845 (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'), 846 (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'), 847 ): 848 with self.subTest(data=data, error_handler=error_handler, 849 expected=expected): 850 self.assertEqual(data.decode(self.encoding, error_handler), 851 expected) 852 853 def test_lone_surrogates(self): 854 super().test_lone_surrogates() 855 # not sure if this is making sense for 856 # UTF-16 and UTF-32 857 self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"), 858 self.BOM + b'[\x80]') 859 860 with self.assertRaises(UnicodeEncodeError) as cm: 861 "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape") 862 exc = cm.exception 863 self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF') 864 865 def test_surrogatepass_handler(self): 866 self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"), 867 self.BOM + b"abc\xed\xa0\x80def") 868 self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"), 869 self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80") 870 self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"), 871 self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]') 872 873 self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"), 874 "abc\ud800def") 875 self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"), 876 "\U00010fff\uD800") 877 878 self.assertTrue(codecs.lookup_error("surrogatepass")) 879 with self.assertRaises(UnicodeDecodeError): 880 b"abc\xed\xa0".decode(self.encoding, "surrogatepass") 881 with self.assertRaises(UnicodeDecodeError): 882 b"abc\xed\xa0z".decode(self.encoding, "surrogatepass") 883 884 def test_incremental_errors(self): 885 # Test that the incremental decoder can fail with final=False. 886 # See issue #24214 887 cases = [b'\x80', b'\xBF', b'\xC0', b'\xC1', b'\xF5', b'\xF6', b'\xFF'] 888 for prefix in (b'\xC2', b'\xDF', b'\xE0', b'\xE0\xA0', b'\xEF', 889 b'\xEF\xBF', b'\xF0', b'\xF0\x90', b'\xF0\x90\x80', 890 b'\xF4', b'\xF4\x8F', b'\xF4\x8F\xBF'): 891 for suffix in b'\x7F', b'\xC0': 892 cases.append(prefix + suffix) 893 cases.extend((b'\xE0\x80', b'\xE0\x9F', b'\xED\xA0\x80', 894 b'\xED\xBF\xBF', b'\xF0\x80', b'\xF0\x8F', b'\xF4\x90')) 895 896 for data in cases: 897 with self.subTest(data=data): 898 dec = codecs.getincrementaldecoder(self.encoding)() 899 self.assertRaises(UnicodeDecodeError, dec.decode, data) 900 901 902class UTF7Test(ReadTest, unittest.TestCase): 903 encoding = "utf-7" 904 905 def test_ascii(self): 906 # Set D (directly encoded characters) 907 set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' 908 'abcdefghijklmnopqrstuvwxyz' 909 '0123456789' 910 '\'(),-./:?') 911 self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii')) 912 self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d) 913 # Set O (optional direct characters) 914 set_o = ' !"#$%&*;<=>@[]^_`{|}' 915 self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii')) 916 self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o) 917 # + 918 self.assertEqual('a+b'.encode(self.encoding), b'a+-b') 919 self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b') 920 # White spaces 921 ws = ' \t\n\r' 922 self.assertEqual(ws.encode(self.encoding), ws.encode('ascii')) 923 self.assertEqual(ws.encode('ascii').decode(self.encoding), ws) 924 # Other ASCII characters 925 other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) - 926 set(set_d + set_o + '+' + ws))) 927 self.assertEqual(other_ascii.encode(self.encoding), 928 b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU' 929 b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-') 930 931 def test_partial(self): 932 self.check_partial( 933 'a+-b\x00c\x80d\u0100e\U00010000f', 934 [ 935 'a', 936 'a', 937 'a+', 938 'a+-', 939 'a+-b', 940 'a+-b', 941 'a+-b', 942 'a+-b', 943 'a+-b', 944 'a+-b\x00', 945 'a+-b\x00c', 946 'a+-b\x00c', 947 'a+-b\x00c', 948 'a+-b\x00c', 949 'a+-b\x00c', 950 'a+-b\x00c\x80', 951 'a+-b\x00c\x80d', 952 'a+-b\x00c\x80d', 953 'a+-b\x00c\x80d', 954 'a+-b\x00c\x80d', 955 'a+-b\x00c\x80d', 956 'a+-b\x00c\x80d\u0100', 957 'a+-b\x00c\x80d\u0100e', 958 'a+-b\x00c\x80d\u0100e', 959 'a+-b\x00c\x80d\u0100e', 960 'a+-b\x00c\x80d\u0100e', 961 'a+-b\x00c\x80d\u0100e', 962 'a+-b\x00c\x80d\u0100e', 963 'a+-b\x00c\x80d\u0100e', 964 'a+-b\x00c\x80d\u0100e', 965 'a+-b\x00c\x80d\u0100e\U00010000', 966 'a+-b\x00c\x80d\u0100e\U00010000f', 967 ] 968 ) 969 970 def test_errors(self): 971 tests = [ 972 (b'\xffb', '\ufffdb'), 973 (b'a\xffb', 'a\ufffdb'), 974 (b'a\xff\xffb', 'a\ufffd\ufffdb'), 975 (b'a+IK', 'a\ufffd'), 976 (b'a+IK-b', 'a\ufffdb'), 977 (b'a+IK,b', 'a\ufffdb'), 978 (b'a+IKx', 'a\u20ac\ufffd'), 979 (b'a+IKx-b', 'a\u20ac\ufffdb'), 980 (b'a+IKwgr', 'a\u20ac\ufffd'), 981 (b'a+IKwgr-b', 'a\u20ac\ufffdb'), 982 (b'a+IKwgr,', 'a\u20ac\ufffd'), 983 (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'), 984 (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'), 985 (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'), 986 (b'a+/,+IKw-b', 'a\ufffd\u20acb'), 987 (b'a+//,+IKw-b', 'a\ufffd\u20acb'), 988 (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'), 989 (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'), 990 (b'a+IKw-b\xff', 'a\u20acb\ufffd'), 991 (b'a+IKw\xffb', 'a\u20ac\ufffdb'), 992 (b'a+@b', 'a\ufffdb'), 993 ] 994 for raw, expected in tests: 995 with self.subTest(raw=raw): 996 self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode, 997 raw, 'strict', True) 998 self.assertEqual(raw.decode('utf-7', 'replace'), expected) 999 1000 def test_nonbmp(self): 1001 self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-') 1002 self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-') 1003 self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0') 1004 self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0') 1005 self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-') 1006 self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0') 1007 self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0') 1008 self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding), 1009 b'+IKwgrNgB3KA-') 1010 self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding), 1011 '\u20ac\u20ac\U000104A0') 1012 self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding), 1013 '\u20ac\u20ac\U000104A0') 1014 1015 def test_lone_surrogates(self): 1016 tests = [ 1017 (b'a+2AE-b', 'a\ud801b'), 1018 (b'a+2AE\xffb', 'a\ufffdb'), 1019 (b'a+2AE', 'a\ufffd'), 1020 (b'a+2AEA-b', 'a\ufffdb'), 1021 (b'a+2AH-b', 'a\ufffdb'), 1022 (b'a+IKzYAQ-b', 'a\u20ac\ud801b'), 1023 (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'), 1024 (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'), 1025 (b'a+IKzYAd-b', 'a\u20ac\ufffdb'), 1026 (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'), 1027 (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'), 1028 (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'), 1029 (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'), 1030 ] 1031 for raw, expected in tests: 1032 with self.subTest(raw=raw): 1033 self.assertEqual(raw.decode('utf-7', 'replace'), expected) 1034 1035 1036class UTF16ExTest(unittest.TestCase): 1037 1038 def test_errors(self): 1039 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True) 1040 1041 def test_bad_args(self): 1042 self.assertRaises(TypeError, codecs.utf_16_ex_decode) 1043 1044class ReadBufferTest(unittest.TestCase): 1045 1046 def test_array(self): 1047 import array 1048 self.assertEqual( 1049 codecs.readbuffer_encode(array.array("b", b"spam")), 1050 (b"spam", 4) 1051 ) 1052 1053 def test_empty(self): 1054 self.assertEqual(codecs.readbuffer_encode(""), (b"", 0)) 1055 1056 def test_bad_args(self): 1057 self.assertRaises(TypeError, codecs.readbuffer_encode) 1058 self.assertRaises(TypeError, codecs.readbuffer_encode, 42) 1059 1060class UTF8SigTest(UTF8Test, unittest.TestCase): 1061 encoding = "utf-8-sig" 1062 BOM = codecs.BOM_UTF8 1063 1064 def test_partial(self): 1065 self.check_partial( 1066 "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000", 1067 [ 1068 "", 1069 "", 1070 "", # First BOM has been read and skipped 1071 "", 1072 "", 1073 "\ufeff", # Second BOM has been read and emitted 1074 "\ufeff\x00", # "\x00" read and emitted 1075 "\ufeff\x00", # First byte of encoded "\xff" read 1076 "\ufeff\x00\xff", # Second byte of encoded "\xff" read 1077 "\ufeff\x00\xff", # First byte of encoded "\u07ff" read 1078 "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read 1079 "\ufeff\x00\xff\u07ff", 1080 "\ufeff\x00\xff\u07ff", 1081 "\ufeff\x00\xff\u07ff\u0800", 1082 "\ufeff\x00\xff\u07ff\u0800", 1083 "\ufeff\x00\xff\u07ff\u0800", 1084 "\ufeff\x00\xff\u07ff\u0800\uffff", 1085 "\ufeff\x00\xff\u07ff\u0800\uffff", 1086 "\ufeff\x00\xff\u07ff\u0800\uffff", 1087 "\ufeff\x00\xff\u07ff\u0800\uffff", 1088 "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000", 1089 ] 1090 ) 1091 1092 def test_bug1601501(self): 1093 # SF bug #1601501: check that the codec works with a buffer 1094 self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "") 1095 1096 def test_bom(self): 1097 d = codecs.getincrementaldecoder("utf-8-sig")() 1098 s = "spam" 1099 self.assertEqual(d.decode(s.encode("utf-8-sig")), s) 1100 1101 def test_stream_bom(self): 1102 unistring = "ABC\u00A1\u2200XYZ" 1103 bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ" 1104 1105 reader = codecs.getreader("utf-8-sig") 1106 for sizehint in [None] + list(range(1, 11)) + \ 1107 [64, 128, 256, 512, 1024]: 1108 istream = reader(io.BytesIO(bytestring)) 1109 ostream = io.StringIO() 1110 while 1: 1111 if sizehint is not None: 1112 data = istream.read(sizehint) 1113 else: 1114 data = istream.read() 1115 1116 if not data: 1117 break 1118 ostream.write(data) 1119 1120 got = ostream.getvalue() 1121 self.assertEqual(got, unistring) 1122 1123 def test_stream_bare(self): 1124 unistring = "ABC\u00A1\u2200XYZ" 1125 bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ" 1126 1127 reader = codecs.getreader("utf-8-sig") 1128 for sizehint in [None] + list(range(1, 11)) + \ 1129 [64, 128, 256, 512, 1024]: 1130 istream = reader(io.BytesIO(bytestring)) 1131 ostream = io.StringIO() 1132 while 1: 1133 if sizehint is not None: 1134 data = istream.read(sizehint) 1135 else: 1136 data = istream.read() 1137 1138 if not data: 1139 break 1140 ostream.write(data) 1141 1142 got = ostream.getvalue() 1143 self.assertEqual(got, unistring) 1144 1145 1146class EscapeDecodeTest(unittest.TestCase): 1147 def test_empty(self): 1148 self.assertEqual(codecs.escape_decode(b""), (b"", 0)) 1149 self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0)) 1150 1151 def test_raw(self): 1152 decode = codecs.escape_decode 1153 for b in range(256): 1154 b = bytes([b]) 1155 if b != b'\\': 1156 self.assertEqual(decode(b + b'0'), (b + b'0', 2)) 1157 1158 def test_escape(self): 1159 decode = codecs.escape_decode 1160 check = coding_checker(self, decode) 1161 check(b"[\\\n]", b"[]") 1162 check(br'[\"]', b'["]') 1163 check(br"[\']", b"[']") 1164 check(br"[\\]", b"[\\]") 1165 check(br"[\a]", b"[\x07]") 1166 check(br"[\b]", b"[\x08]") 1167 check(br"[\t]", b"[\x09]") 1168 check(br"[\n]", b"[\x0a]") 1169 check(br"[\v]", b"[\x0b]") 1170 check(br"[\f]", b"[\x0c]") 1171 check(br"[\r]", b"[\x0d]") 1172 check(br"[\7]", b"[\x07]") 1173 check(br"[\78]", b"[\x078]") 1174 check(br"[\41]", b"[!]") 1175 check(br"[\418]", b"[!8]") 1176 check(br"[\101]", b"[A]") 1177 check(br"[\1010]", b"[A0]") 1178 check(br"[\501]", b"[A]") 1179 check(br"[\x41]", b"[A]") 1180 check(br"[\x410]", b"[A0]") 1181 for i in range(97, 123): 1182 b = bytes([i]) 1183 if b not in b'abfnrtvx': 1184 with self.assertWarns(DeprecationWarning): 1185 check(b"\\" + b, b"\\" + b) 1186 with self.assertWarns(DeprecationWarning): 1187 check(b"\\" + b.upper(), b"\\" + b.upper()) 1188 with self.assertWarns(DeprecationWarning): 1189 check(br"\8", b"\\8") 1190 with self.assertWarns(DeprecationWarning): 1191 check(br"\9", b"\\9") 1192 with self.assertWarns(DeprecationWarning): 1193 check(b"\\\xfa", b"\\\xfa") 1194 1195 def test_errors(self): 1196 decode = codecs.escape_decode 1197 self.assertRaises(ValueError, decode, br"\x") 1198 self.assertRaises(ValueError, decode, br"[\x]") 1199 self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6)) 1200 self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6)) 1201 self.assertRaises(ValueError, decode, br"\x0") 1202 self.assertRaises(ValueError, decode, br"[\x0]") 1203 self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8)) 1204 self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8)) 1205 1206 1207# From RFC 3492 1208punycode_testcases = [ 1209 # A Arabic (Egyptian): 1210 ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644" 1211 "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F", 1212 b"egbpdaj6bu4bxfgehfvwxn"), 1213 # B Chinese (simplified): 1214 ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587", 1215 b"ihqwcrb4cv8a8dqg056pqjye"), 1216 # C Chinese (traditional): 1217 ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587", 1218 b"ihqwctvzc91f659drss3x8bo0yb"), 1219 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky 1220 ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074" 1221 "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D" 1222 "\u0065\u0073\u006B\u0079", 1223 b"Proprostnemluvesky-uyb24dma41a"), 1224 # E Hebrew: 1225 ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8" 1226 "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2" 1227 "\u05D1\u05E8\u05D9\u05EA", 1228 b"4dbcagdahymbxekheh6e0a7fei0b"), 1229 # F Hindi (Devanagari): 1230 ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D" 1231 "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939" 1232 "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947" 1233 "\u0939\u0948\u0902", 1234 b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"), 1235 1236 #(G) Japanese (kanji and hiragana): 1237 ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092" 1238 "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B", 1239 b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"), 1240 1241 # (H) Korean (Hangul syllables): 1242 ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774" 1243 "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74" 1244 "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C", 1245 b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j" 1246 b"psd879ccm6fea98c"), 1247 1248 # (I) Russian (Cyrillic): 1249 ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E" 1250 "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440" 1251 "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A" 1252 "\u0438", 1253 b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"), 1254 1255 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol 1256 ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070" 1257 "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070" 1258 "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061" 1259 "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070" 1260 "\u0061\u00F1\u006F\u006C", 1261 b"PorqunopuedensimplementehablarenEspaol-fmd56a"), 1262 1263 # (K) Vietnamese: 1264 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\ 1265 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t 1266 ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B" 1267 "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068" 1268 "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067" 1269 "\u0056\u0069\u1EC7\u0074", 1270 b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"), 1271 1272 #(L) 3<nen>B<gumi><kinpachi><sensei> 1273 ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F", 1274 b"3B-ww4c5e180e575a65lsy2b"), 1275 1276 # (M) <amuro><namie>-with-SUPER-MONKEYS 1277 ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074" 1278 "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D" 1279 "\u004F\u004E\u004B\u0045\u0059\u0053", 1280 b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"), 1281 1282 # (N) Hello-Another-Way-<sorezore><no><basho> 1283 ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F" 1284 "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D" 1285 "\u305D\u308C\u305E\u308C\u306E\u5834\u6240", 1286 b"Hello-Another-Way--fc4qua05auwb3674vfr0b"), 1287 1288 # (O) <hitotsu><yane><no><shita>2 1289 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032", 1290 b"2-u9tlzr9756bt3uc0v"), 1291 1292 # (P) Maji<de>Koi<suru>5<byou><mae> 1293 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059" 1294 "\u308B\u0035\u79D2\u524D", 1295 b"MajiKoi5-783gue6qz075azm5e"), 1296 1297 # (Q) <pafii>de<runba> 1298 ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0", 1299 b"de-jg4avhby1noc0d"), 1300 1301 # (R) <sono><supiido><de> 1302 ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067", 1303 b"d9juau41awczczp"), 1304 1305 # (S) -> $1.00 <- 1306 ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020" 1307 "\u003C\u002D", 1308 b"-> $1.00 <--") 1309 ] 1310 1311for i in punycode_testcases: 1312 if len(i)!=2: 1313 print(repr(i)) 1314 1315 1316class PunycodeTest(unittest.TestCase): 1317 def test_encode(self): 1318 for uni, puny in punycode_testcases: 1319 # Need to convert both strings to lower case, since 1320 # some of the extended encodings use upper case, but our 1321 # code produces only lower case. Converting just puny to 1322 # lower is also insufficient, since some of the input characters 1323 # are upper case. 1324 self.assertEqual( 1325 str(uni.encode("punycode"), "ascii").lower(), 1326 str(puny, "ascii").lower() 1327 ) 1328 1329 def test_decode(self): 1330 for uni, puny in punycode_testcases: 1331 self.assertEqual(uni, puny.decode("punycode")) 1332 puny = puny.decode("ascii").encode("ascii") 1333 self.assertEqual(uni, puny.decode("punycode")) 1334 1335 def test_decode_invalid(self): 1336 testcases = [ 1337 (b"xn--w&", "strict", UnicodeError()), 1338 (b"xn--w&", "ignore", "xn-"), 1339 ] 1340 for puny, errors, expected in testcases: 1341 with self.subTest(puny=puny, errors=errors): 1342 if isinstance(expected, Exception): 1343 self.assertRaises(UnicodeError, puny.decode, "punycode", errors) 1344 else: 1345 self.assertEqual(puny.decode("punycode", errors), expected) 1346 1347 1348# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html 1349nameprep_tests = [ 1350 # 3.1 Map to nothing. 1351 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar' 1352 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef' 1353 b'\xb8\x8f\xef\xbb\xbf', 1354 b'foobarbaz'), 1355 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045. 1356 (b'CAFE', 1357 b'cafe'), 1358 # 3.3 Case folding 8bit U+00DF (german sharp s). 1359 # The original test case is bogus; it says \xc3\xdf 1360 (b'\xc3\x9f', 1361 b'ss'), 1362 # 3.4 Case folding U+0130 (turkish capital I with dot). 1363 (b'\xc4\xb0', 1364 b'i\xcc\x87'), 1365 # 3.5 Case folding multibyte U+0143 U+037A. 1366 (b'\xc5\x83\xcd\xba', 1367 b'\xc5\x84 \xce\xb9'), 1368 # 3.6 Case folding U+2121 U+33C6 U+1D7BB. 1369 # XXX: skip this as it fails in UCS-2 mode 1370 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb', 1371 # 'telc\xe2\x88\x95kg\xcf\x83'), 1372 (None, None), 1373 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA. 1374 (b'j\xcc\x8c\xc2\xa0\xc2\xaa', 1375 b'\xc7\xb0 a'), 1376 # 3.8 Case folding U+1FB7 and normalization. 1377 (b'\xe1\xbe\xb7', 1378 b'\xe1\xbe\xb6\xce\xb9'), 1379 # 3.9 Self-reverting case folding U+01F0 and normalization. 1380 # The original test case is bogus, it says `\xc7\xf0' 1381 (b'\xc7\xb0', 1382 b'\xc7\xb0'), 1383 # 3.10 Self-reverting case folding U+0390 and normalization. 1384 (b'\xce\x90', 1385 b'\xce\x90'), 1386 # 3.11 Self-reverting case folding U+03B0 and normalization. 1387 (b'\xce\xb0', 1388 b'\xce\xb0'), 1389 # 3.12 Self-reverting case folding U+1E96 and normalization. 1390 (b'\xe1\xba\x96', 1391 b'\xe1\xba\x96'), 1392 # 3.13 Self-reverting case folding U+1F56 and normalization. 1393 (b'\xe1\xbd\x96', 1394 b'\xe1\xbd\x96'), 1395 # 3.14 ASCII space character U+0020. 1396 (b' ', 1397 b' '), 1398 # 3.15 Non-ASCII 8bit space character U+00A0. 1399 (b'\xc2\xa0', 1400 b' '), 1401 # 3.16 Non-ASCII multibyte space character U+1680. 1402 (b'\xe1\x9a\x80', 1403 None), 1404 # 3.17 Non-ASCII multibyte space character U+2000. 1405 (b'\xe2\x80\x80', 1406 b' '), 1407 # 3.18 Zero Width Space U+200b. 1408 (b'\xe2\x80\x8b', 1409 b''), 1410 # 3.19 Non-ASCII multibyte space character U+3000. 1411 (b'\xe3\x80\x80', 1412 b' '), 1413 # 3.20 ASCII control characters U+0010 U+007F. 1414 (b'\x10\x7f', 1415 b'\x10\x7f'), 1416 # 3.21 Non-ASCII 8bit control character U+0085. 1417 (b'\xc2\x85', 1418 None), 1419 # 3.22 Non-ASCII multibyte control character U+180E. 1420 (b'\xe1\xa0\x8e', 1421 None), 1422 # 3.23 Zero Width No-Break Space U+FEFF. 1423 (b'\xef\xbb\xbf', 1424 b''), 1425 # 3.24 Non-ASCII control character U+1D175. 1426 (b'\xf0\x9d\x85\xb5', 1427 None), 1428 # 3.25 Plane 0 private use character U+F123. 1429 (b'\xef\x84\xa3', 1430 None), 1431 # 3.26 Plane 15 private use character U+F1234. 1432 (b'\xf3\xb1\x88\xb4', 1433 None), 1434 # 3.27 Plane 16 private use character U+10F234. 1435 (b'\xf4\x8f\x88\xb4', 1436 None), 1437 # 3.28 Non-character code point U+8FFFE. 1438 (b'\xf2\x8f\xbf\xbe', 1439 None), 1440 # 3.29 Non-character code point U+10FFFF. 1441 (b'\xf4\x8f\xbf\xbf', 1442 None), 1443 # 3.30 Surrogate code U+DF42. 1444 (b'\xed\xbd\x82', 1445 None), 1446 # 3.31 Non-plain text character U+FFFD. 1447 (b'\xef\xbf\xbd', 1448 None), 1449 # 3.32 Ideographic description character U+2FF5. 1450 (b'\xe2\xbf\xb5', 1451 None), 1452 # 3.33 Display property character U+0341. 1453 (b'\xcd\x81', 1454 b'\xcc\x81'), 1455 # 3.34 Left-to-right mark U+200E. 1456 (b'\xe2\x80\x8e', 1457 None), 1458 # 3.35 Deprecated U+202A. 1459 (b'\xe2\x80\xaa', 1460 None), 1461 # 3.36 Language tagging character U+E0001. 1462 (b'\xf3\xa0\x80\x81', 1463 None), 1464 # 3.37 Language tagging character U+E0042. 1465 (b'\xf3\xa0\x81\x82', 1466 None), 1467 # 3.38 Bidi: RandALCat character U+05BE and LCat characters. 1468 (b'foo\xd6\xbebar', 1469 None), 1470 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters. 1471 (b'foo\xef\xb5\x90bar', 1472 None), 1473 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters. 1474 (b'foo\xef\xb9\xb6bar', 1475 b'foo \xd9\x8ebar'), 1476 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031. 1477 (b'\xd8\xa71', 1478 None), 1479 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628. 1480 (b'\xd8\xa71\xd8\xa8', 1481 b'\xd8\xa71\xd8\xa8'), 1482 # 3.43 Unassigned code point U+E0002. 1483 # Skip this test as we allow unassigned 1484 #(b'\xf3\xa0\x80\x82', 1485 # None), 1486 (None, None), 1487 # 3.44 Larger test (shrinking). 1488 # Original test case reads \xc3\xdf 1489 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2' 1490 b'\xaa\xce\xb0\xe2\x80\x80', 1491 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '), 1492 # 3.45 Larger test (expanding). 1493 # Original test case reads \xc3\x9f 1494 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c' 1495 b'\x80', 1496 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3' 1497 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82' 1498 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88') 1499 ] 1500 1501 1502class NameprepTest(unittest.TestCase): 1503 def test_nameprep(self): 1504 from encodings.idna import nameprep 1505 for pos, (orig, prepped) in enumerate(nameprep_tests): 1506 if orig is None: 1507 # Skipped 1508 continue 1509 # The Unicode strings are given in UTF-8 1510 orig = str(orig, "utf-8", "surrogatepass") 1511 if prepped is None: 1512 # Input contains prohibited characters 1513 self.assertRaises(UnicodeError, nameprep, orig) 1514 else: 1515 prepped = str(prepped, "utf-8", "surrogatepass") 1516 try: 1517 self.assertEqual(nameprep(orig), prepped) 1518 except Exception as e: 1519 raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e))) 1520 1521 1522class IDNACodecTest(unittest.TestCase): 1523 def test_builtin_decode(self): 1524 self.assertEqual(str(b"python.org", "idna"), "python.org") 1525 self.assertEqual(str(b"python.org.", "idna"), "python.org.") 1526 self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org") 1527 self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.") 1528 1529 def test_builtin_encode(self): 1530 self.assertEqual("python.org".encode("idna"), b"python.org") 1531 self.assertEqual("python.org.".encode("idna"), b"python.org.") 1532 self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org") 1533 self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.") 1534 1535 def test_builtin_decode_length_limit(self): 1536 with self.assertRaisesRegex(UnicodeError, "too long"): 1537 (b"xn--016c"+b"a"*1100).decode("idna") 1538 with self.assertRaisesRegex(UnicodeError, "too long"): 1539 (b"xn--016c"+b"a"*70).decode("idna") 1540 1541 def test_stream(self): 1542 r = codecs.getreader("idna")(io.BytesIO(b"abc")) 1543 r.read(3) 1544 self.assertEqual(r.read(), "") 1545 1546 def test_incremental_decode(self): 1547 self.assertEqual( 1548 "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")), 1549 "python.org" 1550 ) 1551 self.assertEqual( 1552 "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")), 1553 "python.org." 1554 ) 1555 self.assertEqual( 1556 "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")), 1557 "pyth\xf6n.org." 1558 ) 1559 self.assertEqual( 1560 "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")), 1561 "pyth\xf6n.org." 1562 ) 1563 1564 decoder = codecs.getincrementaldecoder("idna")() 1565 self.assertEqual(decoder.decode(b"xn--xam", ), "") 1566 self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.") 1567 self.assertEqual(decoder.decode(b"rg"), "") 1568 self.assertEqual(decoder.decode(b"", True), "org") 1569 1570 decoder.reset() 1571 self.assertEqual(decoder.decode(b"xn--xam", ), "") 1572 self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.") 1573 self.assertEqual(decoder.decode(b"rg."), "org.") 1574 self.assertEqual(decoder.decode(b"", True), "") 1575 1576 def test_incremental_encode(self): 1577 self.assertEqual( 1578 b"".join(codecs.iterencode("python.org", "idna")), 1579 b"python.org" 1580 ) 1581 self.assertEqual( 1582 b"".join(codecs.iterencode("python.org.", "idna")), 1583 b"python.org." 1584 ) 1585 self.assertEqual( 1586 b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")), 1587 b"xn--pythn-mua.org." 1588 ) 1589 self.assertEqual( 1590 b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")), 1591 b"xn--pythn-mua.org." 1592 ) 1593 1594 encoder = codecs.getincrementalencoder("idna")() 1595 self.assertEqual(encoder.encode("\xe4x"), b"") 1596 self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.") 1597 self.assertEqual(encoder.encode("", True), b"org") 1598 1599 encoder.reset() 1600 self.assertEqual(encoder.encode("\xe4x"), b"") 1601 self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.") 1602 self.assertEqual(encoder.encode("", True), b"") 1603 1604 def test_errors(self): 1605 """Only supports "strict" error handler""" 1606 "python.org".encode("idna", "strict") 1607 b"python.org".decode("idna", "strict") 1608 for errors in ("ignore", "replace", "backslashreplace", 1609 "surrogateescape"): 1610 self.assertRaises(Exception, "python.org".encode, "idna", errors) 1611 self.assertRaises(Exception, 1612 b"python.org".decode, "idna", errors) 1613 1614 1615class CodecsModuleTest(unittest.TestCase): 1616 1617 def test_decode(self): 1618 self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'), 1619 '\xe4\xf6\xfc') 1620 self.assertRaises(TypeError, codecs.decode) 1621 self.assertEqual(codecs.decode(b'abc'), 'abc') 1622 self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii') 1623 1624 # test keywords 1625 self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'), 1626 '\xe4\xf6\xfc') 1627 self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'), 1628 '[]') 1629 1630 def test_encode(self): 1631 self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'), 1632 b'\xe4\xf6\xfc') 1633 self.assertRaises(TypeError, codecs.encode) 1634 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__") 1635 self.assertEqual(codecs.encode('abc'), b'abc') 1636 self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii') 1637 1638 # test keywords 1639 self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'), 1640 b'\xe4\xf6\xfc') 1641 self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'), 1642 b'[]') 1643 1644 def test_register(self): 1645 self.assertRaises(TypeError, codecs.register) 1646 self.assertRaises(TypeError, codecs.register, 42) 1647 1648 def test_lookup(self): 1649 self.assertRaises(TypeError, codecs.lookup) 1650 self.assertRaises(LookupError, codecs.lookup, "__spam__") 1651 self.assertRaises(LookupError, codecs.lookup, " ") 1652 1653 def test_getencoder(self): 1654 self.assertRaises(TypeError, codecs.getencoder) 1655 self.assertRaises(LookupError, codecs.getencoder, "__spam__") 1656 1657 def test_getdecoder(self): 1658 self.assertRaises(TypeError, codecs.getdecoder) 1659 self.assertRaises(LookupError, codecs.getdecoder, "__spam__") 1660 1661 def test_getreader(self): 1662 self.assertRaises(TypeError, codecs.getreader) 1663 self.assertRaises(LookupError, codecs.getreader, "__spam__") 1664 1665 def test_getwriter(self): 1666 self.assertRaises(TypeError, codecs.getwriter) 1667 self.assertRaises(LookupError, codecs.getwriter, "__spam__") 1668 1669 def test_lookup_issue1813(self): 1670 # Issue #1813: under Turkish locales, lookup of some codecs failed 1671 # because 'I' is lowercased as "ı" (dotless i) 1672 oldlocale = locale.setlocale(locale.LC_CTYPE) 1673 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale) 1674 try: 1675 locale.setlocale(locale.LC_CTYPE, 'tr_TR') 1676 except locale.Error: 1677 # Unsupported locale on this system 1678 self.skipTest('test needs Turkish locale') 1679 c = codecs.lookup('ASCII') 1680 self.assertEqual(c.name, 'ascii') 1681 1682 def test_all(self): 1683 api = ( 1684 "encode", "decode", 1685 "register", "CodecInfo", "Codec", "IncrementalEncoder", 1686 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup", 1687 "getencoder", "getdecoder", "getincrementalencoder", 1688 "getincrementaldecoder", "getreader", "getwriter", 1689 "register_error", "lookup_error", 1690 "strict_errors", "replace_errors", "ignore_errors", 1691 "xmlcharrefreplace_errors", "backslashreplace_errors", 1692 "namereplace_errors", 1693 "open", "EncodedFile", 1694 "iterencode", "iterdecode", 1695 "BOM", "BOM_BE", "BOM_LE", 1696 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE", 1697 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE", 1698 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented 1699 "StreamReaderWriter", "StreamRecoder", 1700 ) 1701 self.assertCountEqual(api, codecs.__all__) 1702 for api in codecs.__all__: 1703 getattr(codecs, api) 1704 1705 def test_open(self): 1706 self.addCleanup(support.unlink, support.TESTFN) 1707 for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'): 1708 with self.subTest(mode), \ 1709 codecs.open(support.TESTFN, mode, 'ascii') as file: 1710 self.assertIsInstance(file, codecs.StreamReaderWriter) 1711 1712 def test_undefined(self): 1713 self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined') 1714 self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined') 1715 self.assertRaises(UnicodeError, codecs.encode, '', 'undefined') 1716 self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined') 1717 for errors in ('strict', 'ignore', 'replace', 'backslashreplace'): 1718 self.assertRaises(UnicodeError, 1719 codecs.encode, 'abc', 'undefined', errors) 1720 self.assertRaises(UnicodeError, 1721 codecs.decode, b'abc', 'undefined', errors) 1722 1723 def test_file_closes_if_lookup_error_raised(self): 1724 mock_open = mock.mock_open() 1725 with mock.patch('builtins.open', mock_open) as file: 1726 with self.assertRaises(LookupError): 1727 codecs.open(support.TESTFN, 'wt', 'invalid-encoding') 1728 1729 file().close.assert_called() 1730 1731 1732class StreamReaderTest(unittest.TestCase): 1733 1734 def setUp(self): 1735 self.reader = codecs.getreader('utf-8') 1736 self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80') 1737 1738 def test_readlines(self): 1739 f = self.reader(self.stream) 1740 self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00']) 1741 1742 1743class EncodedFileTest(unittest.TestCase): 1744 1745 def test_basic(self): 1746 f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80') 1747 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8') 1748 self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae') 1749 1750 f = io.BytesIO() 1751 ef = codecs.EncodedFile(f, 'utf-8', 'latin-1') 1752 ef.write(b'\xc3\xbc') 1753 self.assertEqual(f.getvalue(), b'\xfc') 1754 1755all_unicode_encodings = [ 1756 "ascii", 1757 "big5", 1758 "big5hkscs", 1759 "charmap", 1760 "cp037", 1761 "cp1006", 1762 "cp1026", 1763 "cp1125", 1764 "cp1140", 1765 "cp1250", 1766 "cp1251", 1767 "cp1252", 1768 "cp1253", 1769 "cp1254", 1770 "cp1255", 1771 "cp1256", 1772 "cp1257", 1773 "cp1258", 1774 "cp424", 1775 "cp437", 1776 "cp500", 1777 "cp720", 1778 "cp737", 1779 "cp775", 1780 "cp850", 1781 "cp852", 1782 "cp855", 1783 "cp856", 1784 "cp857", 1785 "cp858", 1786 "cp860", 1787 "cp861", 1788 "cp862", 1789 "cp863", 1790 "cp864", 1791 "cp865", 1792 "cp866", 1793 "cp869", 1794 "cp874", 1795 "cp875", 1796 "cp932", 1797 "cp949", 1798 "cp950", 1799 "euc_jis_2004", 1800 "euc_jisx0213", 1801 "euc_jp", 1802 "euc_kr", 1803 "gb18030", 1804 "gb2312", 1805 "gbk", 1806 "hp_roman8", 1807 "hz", 1808 "idna", 1809 "iso2022_jp", 1810 "iso2022_jp_1", 1811 "iso2022_jp_2", 1812 "iso2022_jp_2004", 1813 "iso2022_jp_3", 1814 "iso2022_jp_ext", 1815 "iso2022_kr", 1816 "iso8859_1", 1817 "iso8859_10", 1818 "iso8859_11", 1819 "iso8859_13", 1820 "iso8859_14", 1821 "iso8859_15", 1822 "iso8859_16", 1823 "iso8859_2", 1824 "iso8859_3", 1825 "iso8859_4", 1826 "iso8859_5", 1827 "iso8859_6", 1828 "iso8859_7", 1829 "iso8859_8", 1830 "iso8859_9", 1831 "johab", 1832 "koi8_r", 1833 "koi8_t", 1834 "koi8_u", 1835 "kz1048", 1836 "latin_1", 1837 "mac_cyrillic", 1838 "mac_greek", 1839 "mac_iceland", 1840 "mac_latin2", 1841 "mac_roman", 1842 "mac_turkish", 1843 "palmos", 1844 "ptcp154", 1845 "punycode", 1846 "raw_unicode_escape", 1847 "shift_jis", 1848 "shift_jis_2004", 1849 "shift_jisx0213", 1850 "tis_620", 1851 "unicode_escape", 1852 "utf_16", 1853 "utf_16_be", 1854 "utf_16_le", 1855 "utf_7", 1856 "utf_8", 1857] 1858 1859if hasattr(codecs, "mbcs_encode"): 1860 all_unicode_encodings.append("mbcs") 1861if hasattr(codecs, "oem_encode"): 1862 all_unicode_encodings.append("oem") 1863 1864# The following encoding is not tested, because it's not supposed 1865# to work: 1866# "undefined" 1867 1868# The following encodings don't work in stateful mode 1869broken_unicode_with_stateful = [ 1870 "punycode", 1871] 1872 1873 1874class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling): 1875 def test_basics(self): 1876 s = "abc123" # all codecs should be able to encode these 1877 for encoding in all_unicode_encodings: 1878 name = codecs.lookup(encoding).name 1879 if encoding.endswith("_codec"): 1880 name += "_codec" 1881 elif encoding == "latin_1": 1882 name = "latin_1" 1883 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-")) 1884 1885 (b, size) = codecs.getencoder(encoding)(s) 1886 self.assertEqual(size, len(s), "encoding=%r" % encoding) 1887 (chars, size) = codecs.getdecoder(encoding)(b) 1888 self.assertEqual(chars, s, "encoding=%r" % encoding) 1889 1890 if encoding not in broken_unicode_with_stateful: 1891 # check stream reader/writer 1892 q = Queue(b"") 1893 writer = codecs.getwriter(encoding)(q) 1894 encodedresult = b"" 1895 for c in s: 1896 writer.write(c) 1897 chunk = q.read() 1898 self.assertTrue(type(chunk) is bytes, type(chunk)) 1899 encodedresult += chunk 1900 q = Queue(b"") 1901 reader = codecs.getreader(encoding)(q) 1902 decodedresult = "" 1903 for c in encodedresult: 1904 q.write(bytes([c])) 1905 decodedresult += reader.read() 1906 self.assertEqual(decodedresult, s, "encoding=%r" % encoding) 1907 1908 if encoding not in broken_unicode_with_stateful: 1909 # check incremental decoder/encoder and iterencode()/iterdecode() 1910 try: 1911 encoder = codecs.getincrementalencoder(encoding)() 1912 except LookupError: # no IncrementalEncoder 1913 pass 1914 else: 1915 # check incremental decoder/encoder 1916 encodedresult = b"" 1917 for c in s: 1918 encodedresult += encoder.encode(c) 1919 encodedresult += encoder.encode("", True) 1920 decoder = codecs.getincrementaldecoder(encoding)() 1921 decodedresult = "" 1922 for c in encodedresult: 1923 decodedresult += decoder.decode(bytes([c])) 1924 decodedresult += decoder.decode(b"", True) 1925 self.assertEqual(decodedresult, s, 1926 "encoding=%r" % encoding) 1927 1928 # check iterencode()/iterdecode() 1929 result = "".join(codecs.iterdecode( 1930 codecs.iterencode(s, encoding), encoding)) 1931 self.assertEqual(result, s, "encoding=%r" % encoding) 1932 1933 # check iterencode()/iterdecode() with empty string 1934 result = "".join(codecs.iterdecode( 1935 codecs.iterencode("", encoding), encoding)) 1936 self.assertEqual(result, "") 1937 1938 if encoding not in ("idna", "mbcs"): 1939 # check incremental decoder/encoder with errors argument 1940 try: 1941 encoder = codecs.getincrementalencoder(encoding)("ignore") 1942 except LookupError: # no IncrementalEncoder 1943 pass 1944 else: 1945 encodedresult = b"".join(encoder.encode(c) for c in s) 1946 decoder = codecs.getincrementaldecoder(encoding)("ignore") 1947 decodedresult = "".join(decoder.decode(bytes([c])) 1948 for c in encodedresult) 1949 self.assertEqual(decodedresult, s, 1950 "encoding=%r" % encoding) 1951 1952 @support.cpython_only 1953 def test_basics_capi(self): 1954 s = "abc123" # all codecs should be able to encode these 1955 for encoding in all_unicode_encodings: 1956 if encoding not in broken_unicode_with_stateful: 1957 # check incremental decoder/encoder (fetched via the C API) 1958 try: 1959 cencoder = _testcapi.codec_incrementalencoder(encoding) 1960 except LookupError: # no IncrementalEncoder 1961 pass 1962 else: 1963 # check C API 1964 encodedresult = b"" 1965 for c in s: 1966 encodedresult += cencoder.encode(c) 1967 encodedresult += cencoder.encode("", True) 1968 cdecoder = _testcapi.codec_incrementaldecoder(encoding) 1969 decodedresult = "" 1970 for c in encodedresult: 1971 decodedresult += cdecoder.decode(bytes([c])) 1972 decodedresult += cdecoder.decode(b"", True) 1973 self.assertEqual(decodedresult, s, 1974 "encoding=%r" % encoding) 1975 1976 if encoding not in ("idna", "mbcs"): 1977 # check incremental decoder/encoder with errors argument 1978 try: 1979 cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore") 1980 except LookupError: # no IncrementalEncoder 1981 pass 1982 else: 1983 encodedresult = b"".join(cencoder.encode(c) for c in s) 1984 cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore") 1985 decodedresult = "".join(cdecoder.decode(bytes([c])) 1986 for c in encodedresult) 1987 self.assertEqual(decodedresult, s, 1988 "encoding=%r" % encoding) 1989 1990 def test_seek(self): 1991 # all codecs should be able to encode these 1992 s = "%s\n%s\n" % (100*"abc123", 100*"def456") 1993 for encoding in all_unicode_encodings: 1994 if encoding == "idna": # FIXME: See SF bug #1163178 1995 continue 1996 if encoding in broken_unicode_with_stateful: 1997 continue 1998 reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding))) 1999 for t in range(5): 2000 # Test that calling seek resets the internal codec state and buffers 2001 reader.seek(0, 0) 2002 data = reader.read() 2003 self.assertEqual(s, data) 2004 2005 def test_bad_decode_args(self): 2006 for encoding in all_unicode_encodings: 2007 decoder = codecs.getdecoder(encoding) 2008 self.assertRaises(TypeError, decoder) 2009 if encoding not in ("idna", "punycode"): 2010 self.assertRaises(TypeError, decoder, 42) 2011 2012 def test_bad_encode_args(self): 2013 for encoding in all_unicode_encodings: 2014 encoder = codecs.getencoder(encoding) 2015 self.assertRaises(TypeError, encoder) 2016 2017 def test_encoding_map_type_initialized(self): 2018 from encodings import cp1140 2019 # This used to crash, we are only verifying there's no crash. 2020 table_type = type(cp1140.encoding_table) 2021 self.assertEqual(table_type, table_type) 2022 2023 def test_decoder_state(self): 2024 # Check that getstate() and setstate() handle the state properly 2025 u = "abc123" 2026 for encoding in all_unicode_encodings: 2027 if encoding not in broken_unicode_with_stateful: 2028 self.check_state_handling_decode(encoding, u, u.encode(encoding)) 2029 self.check_state_handling_encode(encoding, u, u.encode(encoding)) 2030 2031 2032class CharmapTest(unittest.TestCase): 2033 def test_decode_with_string_map(self): 2034 self.assertEqual( 2035 codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"), 2036 ("abc", 3) 2037 ) 2038 2039 self.assertEqual( 2040 codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"), 2041 ("\U0010FFFFbc", 3) 2042 ) 2043 2044 self.assertRaises(UnicodeDecodeError, 2045 codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab" 2046 ) 2047 2048 self.assertRaises(UnicodeDecodeError, 2049 codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe" 2050 ) 2051 2052 self.assertEqual( 2053 codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"), 2054 ("ab\ufffd", 3) 2055 ) 2056 2057 self.assertEqual( 2058 codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"), 2059 ("ab\ufffd", 3) 2060 ) 2061 2062 self.assertEqual( 2063 codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab"), 2064 ("ab\\x02", 3) 2065 ) 2066 2067 self.assertEqual( 2068 codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab\ufffe"), 2069 ("ab\\x02", 3) 2070 ) 2071 2072 self.assertEqual( 2073 codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"), 2074 ("ab", 3) 2075 ) 2076 2077 self.assertEqual( 2078 codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"), 2079 ("ab", 3) 2080 ) 2081 2082 allbytes = bytes(range(256)) 2083 self.assertEqual( 2084 codecs.charmap_decode(allbytes, "ignore", ""), 2085 ("", len(allbytes)) 2086 ) 2087 2088 def test_decode_with_int2str_map(self): 2089 self.assertEqual( 2090 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2091 {0: 'a', 1: 'b', 2: 'c'}), 2092 ("abc", 3) 2093 ) 2094 2095 self.assertEqual( 2096 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2097 {0: 'Aa', 1: 'Bb', 2: 'Cc'}), 2098 ("AaBbCc", 3) 2099 ) 2100 2101 self.assertEqual( 2102 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2103 {0: '\U0010FFFF', 1: 'b', 2: 'c'}), 2104 ("\U0010FFFFbc", 3) 2105 ) 2106 2107 self.assertEqual( 2108 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2109 {0: 'a', 1: 'b', 2: ''}), 2110 ("ab", 3) 2111 ) 2112 2113 self.assertRaises(UnicodeDecodeError, 2114 codecs.charmap_decode, b"\x00\x01\x02", "strict", 2115 {0: 'a', 1: 'b'} 2116 ) 2117 2118 self.assertRaises(UnicodeDecodeError, 2119 codecs.charmap_decode, b"\x00\x01\x02", "strict", 2120 {0: 'a', 1: 'b', 2: None} 2121 ) 2122 2123 # Issue #14850 2124 self.assertRaises(UnicodeDecodeError, 2125 codecs.charmap_decode, b"\x00\x01\x02", "strict", 2126 {0: 'a', 1: 'b', 2: '\ufffe'} 2127 ) 2128 2129 self.assertEqual( 2130 codecs.charmap_decode(b"\x00\x01\x02", "replace", 2131 {0: 'a', 1: 'b'}), 2132 ("ab\ufffd", 3) 2133 ) 2134 2135 self.assertEqual( 2136 codecs.charmap_decode(b"\x00\x01\x02", "replace", 2137 {0: 'a', 1: 'b', 2: None}), 2138 ("ab\ufffd", 3) 2139 ) 2140 2141 # Issue #14850 2142 self.assertEqual( 2143 codecs.charmap_decode(b"\x00\x01\x02", "replace", 2144 {0: 'a', 1: 'b', 2: '\ufffe'}), 2145 ("ab\ufffd", 3) 2146 ) 2147 2148 self.assertEqual( 2149 codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", 2150 {0: 'a', 1: 'b'}), 2151 ("ab\\x02", 3) 2152 ) 2153 2154 self.assertEqual( 2155 codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", 2156 {0: 'a', 1: 'b', 2: None}), 2157 ("ab\\x02", 3) 2158 ) 2159 2160 # Issue #14850 2161 self.assertEqual( 2162 codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", 2163 {0: 'a', 1: 'b', 2: '\ufffe'}), 2164 ("ab\\x02", 3) 2165 ) 2166 2167 self.assertEqual( 2168 codecs.charmap_decode(b"\x00\x01\x02", "ignore", 2169 {0: 'a', 1: 'b'}), 2170 ("ab", 3) 2171 ) 2172 2173 self.assertEqual( 2174 codecs.charmap_decode(b"\x00\x01\x02", "ignore", 2175 {0: 'a', 1: 'b', 2: None}), 2176 ("ab", 3) 2177 ) 2178 2179 # Issue #14850 2180 self.assertEqual( 2181 codecs.charmap_decode(b"\x00\x01\x02", "ignore", 2182 {0: 'a', 1: 'b', 2: '\ufffe'}), 2183 ("ab", 3) 2184 ) 2185 2186 allbytes = bytes(range(256)) 2187 self.assertEqual( 2188 codecs.charmap_decode(allbytes, "ignore", {}), 2189 ("", len(allbytes)) 2190 ) 2191 2192 self.assertRaisesRegex(TypeError, 2193 "character mapping must be in range\\(0x110000\\)", 2194 codecs.charmap_decode, 2195 b"\x00\x01\x02", "strict", {0: "A", 1: 'Bb', 2: -2} 2196 ) 2197 2198 self.assertRaisesRegex(TypeError, 2199 "character mapping must be in range\\(0x110000\\)", 2200 codecs.charmap_decode, 2201 b"\x00\x01\x02", "strict", {0: "A", 1: 'Bb', 2: 999999999} 2202 ) 2203 2204 def test_decode_with_int2int_map(self): 2205 a = ord('a') 2206 b = ord('b') 2207 c = ord('c') 2208 2209 self.assertEqual( 2210 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2211 {0: a, 1: b, 2: c}), 2212 ("abc", 3) 2213 ) 2214 2215 # Issue #15379 2216 self.assertEqual( 2217 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2218 {0: 0x10FFFF, 1: b, 2: c}), 2219 ("\U0010FFFFbc", 3) 2220 ) 2221 2222 self.assertEqual( 2223 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2224 {0: sys.maxunicode, 1: b, 2: c}), 2225 (chr(sys.maxunicode) + "bc", 3) 2226 ) 2227 2228 self.assertRaises(TypeError, 2229 codecs.charmap_decode, b"\x00\x01\x02", "strict", 2230 {0: sys.maxunicode + 1, 1: b, 2: c} 2231 ) 2232 2233 self.assertRaises(UnicodeDecodeError, 2234 codecs.charmap_decode, b"\x00\x01\x02", "strict", 2235 {0: a, 1: b}, 2236 ) 2237 2238 self.assertRaises(UnicodeDecodeError, 2239 codecs.charmap_decode, b"\x00\x01\x02", "strict", 2240 {0: a, 1: b, 2: 0xFFFE}, 2241 ) 2242 2243 self.assertEqual( 2244 codecs.charmap_decode(b"\x00\x01\x02", "replace", 2245 {0: a, 1: b}), 2246 ("ab\ufffd", 3) 2247 ) 2248 2249 self.assertEqual( 2250 codecs.charmap_decode(b"\x00\x01\x02", "replace", 2251 {0: a, 1: b, 2: 0xFFFE}), 2252 ("ab\ufffd", 3) 2253 ) 2254 2255 self.assertEqual( 2256 codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", 2257 {0: a, 1: b}), 2258 ("ab\\x02", 3) 2259 ) 2260 2261 self.assertEqual( 2262 codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", 2263 {0: a, 1: b, 2: 0xFFFE}), 2264 ("ab\\x02", 3) 2265 ) 2266 2267 self.assertEqual( 2268 codecs.charmap_decode(b"\x00\x01\x02", "ignore", 2269 {0: a, 1: b}), 2270 ("ab", 3) 2271 ) 2272 2273 self.assertEqual( 2274 codecs.charmap_decode(b"\x00\x01\x02", "ignore", 2275 {0: a, 1: b, 2: 0xFFFE}), 2276 ("ab", 3) 2277 ) 2278 2279 2280class WithStmtTest(unittest.TestCase): 2281 def test_encodedfile(self): 2282 f = io.BytesIO(b"\xc3\xbc") 2283 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef: 2284 self.assertEqual(ef.read(), b"\xfc") 2285 self.assertTrue(f.closed) 2286 2287 def test_streamreaderwriter(self): 2288 f = io.BytesIO(b"\xc3\xbc") 2289 info = codecs.lookup("utf-8") 2290 with codecs.StreamReaderWriter(f, info.streamreader, 2291 info.streamwriter, 'strict') as srw: 2292 self.assertEqual(srw.read(), "\xfc") 2293 2294 2295class TypesTest(unittest.TestCase): 2296 def test_decode_unicode(self): 2297 # Most decoders don't accept unicode input 2298 decoders = [ 2299 codecs.utf_7_decode, 2300 codecs.utf_8_decode, 2301 codecs.utf_16_le_decode, 2302 codecs.utf_16_be_decode, 2303 codecs.utf_16_ex_decode, 2304 codecs.utf_32_decode, 2305 codecs.utf_32_le_decode, 2306 codecs.utf_32_be_decode, 2307 codecs.utf_32_ex_decode, 2308 codecs.latin_1_decode, 2309 codecs.ascii_decode, 2310 codecs.charmap_decode, 2311 ] 2312 if hasattr(codecs, "mbcs_decode"): 2313 decoders.append(codecs.mbcs_decode) 2314 for decoder in decoders: 2315 self.assertRaises(TypeError, decoder, "xxx") 2316 2317 def test_unicode_escape(self): 2318 # Escape-decoding a unicode string is supported and gives the same 2319 # result as decoding the equivalent ASCII bytes string. 2320 self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6)) 2321 self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6)) 2322 self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6)) 2323 self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6)) 2324 2325 self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000") 2326 self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10)) 2327 self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "backslashreplace"), 2328 (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10)) 2329 2330 self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000") 2331 self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10)) 2332 self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "backslashreplace"), 2333 (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10)) 2334 2335 2336class UnicodeEscapeTest(unittest.TestCase): 2337 def test_empty(self): 2338 self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0)) 2339 self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0)) 2340 2341 def test_raw_encode(self): 2342 encode = codecs.unicode_escape_encode 2343 for b in range(32, 127): 2344 if b != b'\\'[0]: 2345 self.assertEqual(encode(chr(b)), (bytes([b]), 1)) 2346 2347 def test_raw_decode(self): 2348 decode = codecs.unicode_escape_decode 2349 for b in range(256): 2350 if b != b'\\'[0]: 2351 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2)) 2352 2353 def test_escape_encode(self): 2354 encode = codecs.unicode_escape_encode 2355 check = coding_checker(self, encode) 2356 check('\t', br'\t') 2357 check('\n', br'\n') 2358 check('\r', br'\r') 2359 check('\\', br'\\') 2360 for b in range(32): 2361 if chr(b) not in '\t\n\r': 2362 check(chr(b), ('\\x%02x' % b).encode()) 2363 for b in range(127, 256): 2364 check(chr(b), ('\\x%02x' % b).encode()) 2365 check('\u20ac', br'\u20ac') 2366 check('\U0001d120', br'\U0001d120') 2367 2368 def test_escape_decode(self): 2369 decode = codecs.unicode_escape_decode 2370 check = coding_checker(self, decode) 2371 check(b"[\\\n]", "[]") 2372 check(br'[\"]', '["]') 2373 check(br"[\']", "[']") 2374 check(br"[\\]", r"[\]") 2375 check(br"[\a]", "[\x07]") 2376 check(br"[\b]", "[\x08]") 2377 check(br"[\t]", "[\x09]") 2378 check(br"[\n]", "[\x0a]") 2379 check(br"[\v]", "[\x0b]") 2380 check(br"[\f]", "[\x0c]") 2381 check(br"[\r]", "[\x0d]") 2382 check(br"[\7]", "[\x07]") 2383 check(br"[\78]", "[\x078]") 2384 check(br"[\41]", "[!]") 2385 check(br"[\418]", "[!8]") 2386 check(br"[\101]", "[A]") 2387 check(br"[\1010]", "[A0]") 2388 check(br"[\x41]", "[A]") 2389 check(br"[\x410]", "[A0]") 2390 check(br"\u20ac", "\u20ac") 2391 check(br"\U0001d120", "\U0001d120") 2392 for i in range(97, 123): 2393 b = bytes([i]) 2394 if b not in b'abfnrtuvx': 2395 with self.assertWarns(DeprecationWarning): 2396 check(b"\\" + b, "\\" + chr(i)) 2397 if b.upper() not in b'UN': 2398 with self.assertWarns(DeprecationWarning): 2399 check(b"\\" + b.upper(), "\\" + chr(i-32)) 2400 with self.assertWarns(DeprecationWarning): 2401 check(br"\8", "\\8") 2402 with self.assertWarns(DeprecationWarning): 2403 check(br"\9", "\\9") 2404 with self.assertWarns(DeprecationWarning): 2405 check(b"\\\xfa", "\\\xfa") 2406 2407 def test_decode_errors(self): 2408 decode = codecs.unicode_escape_decode 2409 for c, d in (b'x', 2), (b'u', 4), (b'U', 4): 2410 for i in range(d): 2411 self.assertRaises(UnicodeDecodeError, decode, 2412 b"\\" + c + b"0"*i) 2413 self.assertRaises(UnicodeDecodeError, decode, 2414 b"[\\" + c + b"0"*i + b"]") 2415 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i 2416 self.assertEqual(decode(data, "ignore"), ("[]", len(data))) 2417 self.assertEqual(decode(data, "replace"), 2418 ("[\ufffd]\ufffd", len(data))) 2419 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000") 2420 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10)) 2421 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10)) 2422 2423 2424class RawUnicodeEscapeTest(unittest.TestCase): 2425 def test_empty(self): 2426 self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0)) 2427 self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0)) 2428 2429 def test_raw_encode(self): 2430 encode = codecs.raw_unicode_escape_encode 2431 for b in range(256): 2432 self.assertEqual(encode(chr(b)), (bytes([b]), 1)) 2433 2434 def test_raw_decode(self): 2435 decode = codecs.raw_unicode_escape_decode 2436 for b in range(256): 2437 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2)) 2438 2439 def test_escape_encode(self): 2440 encode = codecs.raw_unicode_escape_encode 2441 check = coding_checker(self, encode) 2442 for b in range(256): 2443 if b not in b'uU': 2444 check('\\' + chr(b), b'\\' + bytes([b])) 2445 check('\u20ac', br'\u20ac') 2446 check('\U0001d120', br'\U0001d120') 2447 2448 def test_escape_decode(self): 2449 decode = codecs.raw_unicode_escape_decode 2450 check = coding_checker(self, decode) 2451 for b in range(256): 2452 if b not in b'uU': 2453 check(b'\\' + bytes([b]), '\\' + chr(b)) 2454 check(br"\u20ac", "\u20ac") 2455 check(br"\U0001d120", "\U0001d120") 2456 2457 def test_decode_errors(self): 2458 decode = codecs.raw_unicode_escape_decode 2459 for c, d in (b'u', 4), (b'U', 4): 2460 for i in range(d): 2461 self.assertRaises(UnicodeDecodeError, decode, 2462 b"\\" + c + b"0"*i) 2463 self.assertRaises(UnicodeDecodeError, decode, 2464 b"[\\" + c + b"0"*i + b"]") 2465 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i 2466 self.assertEqual(decode(data, "ignore"), ("[]", len(data))) 2467 self.assertEqual(decode(data, "replace"), 2468 ("[\ufffd]\ufffd", len(data))) 2469 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000") 2470 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10)) 2471 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10)) 2472 2473 2474class EscapeEncodeTest(unittest.TestCase): 2475 2476 def test_escape_encode(self): 2477 tests = [ 2478 (b'', (b'', 0)), 2479 (b'foobar', (b'foobar', 6)), 2480 (b'spam\0eggs', (b'spam\\x00eggs', 9)), 2481 (b'a\'b', (b"a\\'b", 3)), 2482 (b'b\\c', (b'b\\\\c', 3)), 2483 (b'c\nd', (b'c\\nd', 3)), 2484 (b'd\re', (b'd\\re', 3)), 2485 (b'f\x7fg', (b'f\\x7fg', 3)), 2486 ] 2487 for data, output in tests: 2488 with self.subTest(data=data): 2489 self.assertEqual(codecs.escape_encode(data), output) 2490 self.assertRaises(TypeError, codecs.escape_encode, 'spam') 2491 self.assertRaises(TypeError, codecs.escape_encode, bytearray(b'spam')) 2492 2493 2494class SurrogateEscapeTest(unittest.TestCase): 2495 2496 def test_utf8(self): 2497 # Bad byte 2498 self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"), 2499 "foo\udc80bar") 2500 self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"), 2501 b"foo\x80bar") 2502 # bad-utf-8 encoded surrogate 2503 self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"), 2504 "\udced\udcb0\udc80") 2505 self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"), 2506 b"\xed\xb0\x80") 2507 2508 def test_ascii(self): 2509 # bad byte 2510 self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"), 2511 "foo\udc80bar") 2512 self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"), 2513 b"foo\x80bar") 2514 2515 def test_charmap(self): 2516 # bad byte: \xa5 is unmapped in iso-8859-3 2517 self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"), 2518 "foo\udca5bar") 2519 self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"), 2520 b"foo\xa5bar") 2521 2522 def test_latin1(self): 2523 # Issue6373 2524 self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"), 2525 b"\xe4\xeb\xef\xf6\xfc") 2526 2527 2528class BomTest(unittest.TestCase): 2529 def test_seek0(self): 2530 data = "1234567890" 2531 tests = ("utf-16", 2532 "utf-16-le", 2533 "utf-16-be", 2534 "utf-32", 2535 "utf-32-le", 2536 "utf-32-be") 2537 self.addCleanup(support.unlink, support.TESTFN) 2538 for encoding in tests: 2539 # Check if the BOM is written only once 2540 with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f: 2541 f.write(data) 2542 f.write(data) 2543 f.seek(0) 2544 self.assertEqual(f.read(), data * 2) 2545 f.seek(0) 2546 self.assertEqual(f.read(), data * 2) 2547 2548 # Check that the BOM is written after a seek(0) 2549 with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f: 2550 f.write(data[0]) 2551 self.assertNotEqual(f.tell(), 0) 2552 f.seek(0) 2553 f.write(data) 2554 f.seek(0) 2555 self.assertEqual(f.read(), data) 2556 2557 # (StreamWriter) Check that the BOM is written after a seek(0) 2558 with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f: 2559 f.writer.write(data[0]) 2560 self.assertNotEqual(f.writer.tell(), 0) 2561 f.writer.seek(0) 2562 f.writer.write(data) 2563 f.seek(0) 2564 self.assertEqual(f.read(), data) 2565 2566 # Check that the BOM is not written after a seek() at a position 2567 # different than the start 2568 with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f: 2569 f.write(data) 2570 f.seek(f.tell()) 2571 f.write(data) 2572 f.seek(0) 2573 self.assertEqual(f.read(), data * 2) 2574 2575 # (StreamWriter) Check that the BOM is not written after a seek() 2576 # at a position different than the start 2577 with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f: 2578 f.writer.write(data) 2579 f.writer.seek(f.writer.tell()) 2580 f.writer.write(data) 2581 f.seek(0) 2582 self.assertEqual(f.read(), data * 2) 2583 2584 2585bytes_transform_encodings = [ 2586 "base64_codec", 2587 "uu_codec", 2588 "quopri_codec", 2589 "hex_codec", 2590] 2591 2592transform_aliases = { 2593 "base64_codec": ["base64", "base_64"], 2594 "uu_codec": ["uu"], 2595 "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"], 2596 "hex_codec": ["hex"], 2597 "rot_13": ["rot13"], 2598} 2599 2600try: 2601 import zlib 2602except ImportError: 2603 zlib = None 2604else: 2605 bytes_transform_encodings.append("zlib_codec") 2606 transform_aliases["zlib_codec"] = ["zip", "zlib"] 2607try: 2608 import bz2 2609except ImportError: 2610 pass 2611else: 2612 bytes_transform_encodings.append("bz2_codec") 2613 transform_aliases["bz2_codec"] = ["bz2"] 2614 2615 2616class TransformCodecTest(unittest.TestCase): 2617 2618 def test_basics(self): 2619 binput = bytes(range(256)) 2620 for encoding in bytes_transform_encodings: 2621 with self.subTest(encoding=encoding): 2622 # generic codecs interface 2623 (o, size) = codecs.getencoder(encoding)(binput) 2624 self.assertEqual(size, len(binput)) 2625 (i, size) = codecs.getdecoder(encoding)(o) 2626 self.assertEqual(size, len(o)) 2627 self.assertEqual(i, binput) 2628 2629 def test_read(self): 2630 for encoding in bytes_transform_encodings: 2631 with self.subTest(encoding=encoding): 2632 sin = codecs.encode(b"\x80", encoding) 2633 reader = codecs.getreader(encoding)(io.BytesIO(sin)) 2634 sout = reader.read() 2635 self.assertEqual(sout, b"\x80") 2636 2637 def test_readline(self): 2638 for encoding in bytes_transform_encodings: 2639 with self.subTest(encoding=encoding): 2640 sin = codecs.encode(b"\x80", encoding) 2641 reader = codecs.getreader(encoding)(io.BytesIO(sin)) 2642 sout = reader.readline() 2643 self.assertEqual(sout, b"\x80") 2644 2645 def test_buffer_api_usage(self): 2646 # We check all the transform codecs accept memoryview input 2647 # for encoding and decoding 2648 # and also that they roundtrip correctly 2649 original = b"12345\x80" 2650 for encoding in bytes_transform_encodings: 2651 with self.subTest(encoding=encoding): 2652 data = original 2653 view = memoryview(data) 2654 data = codecs.encode(data, encoding) 2655 view_encoded = codecs.encode(view, encoding) 2656 self.assertEqual(view_encoded, data) 2657 view = memoryview(data) 2658 data = codecs.decode(data, encoding) 2659 self.assertEqual(data, original) 2660 view_decoded = codecs.decode(view, encoding) 2661 self.assertEqual(view_decoded, data) 2662 2663 def test_text_to_binary_blacklists_binary_transforms(self): 2664 # Check binary -> binary codecs give a good error for str input 2665 bad_input = "bad input type" 2666 for encoding in bytes_transform_encodings: 2667 with self.subTest(encoding=encoding): 2668 fmt = (r"{!r} is not a text encoding; " 2669 r"use codecs.encode\(\) to handle arbitrary codecs") 2670 msg = fmt.format(encoding) 2671 with self.assertRaisesRegex(LookupError, msg) as failure: 2672 bad_input.encode(encoding) 2673 self.assertIsNone(failure.exception.__cause__) 2674 2675 def test_text_to_binary_blacklists_text_transforms(self): 2676 # Check str.encode gives a good error message for str -> str codecs 2677 msg = (r"^'rot_13' is not a text encoding; " 2678 r"use codecs.encode\(\) to handle arbitrary codecs") 2679 with self.assertRaisesRegex(LookupError, msg): 2680 "just an example message".encode("rot_13") 2681 2682 def test_binary_to_text_blacklists_binary_transforms(self): 2683 # Check bytes.decode and bytearray.decode give a good error 2684 # message for binary -> binary codecs 2685 data = b"encode first to ensure we meet any format restrictions" 2686 for encoding in bytes_transform_encodings: 2687 with self.subTest(encoding=encoding): 2688 encoded_data = codecs.encode(data, encoding) 2689 fmt = (r"{!r} is not a text encoding; " 2690 r"use codecs.decode\(\) to handle arbitrary codecs") 2691 msg = fmt.format(encoding) 2692 with self.assertRaisesRegex(LookupError, msg): 2693 encoded_data.decode(encoding) 2694 with self.assertRaisesRegex(LookupError, msg): 2695 bytearray(encoded_data).decode(encoding) 2696 2697 def test_binary_to_text_blacklists_text_transforms(self): 2698 # Check str -> str codec gives a good error for binary input 2699 for bad_input in (b"immutable", bytearray(b"mutable")): 2700 with self.subTest(bad_input=bad_input): 2701 msg = (r"^'rot_13' is not a text encoding; " 2702 r"use codecs.decode\(\) to handle arbitrary codecs") 2703 with self.assertRaisesRegex(LookupError, msg) as failure: 2704 bad_input.decode("rot_13") 2705 self.assertIsNone(failure.exception.__cause__) 2706 2707 @unittest.skipUnless(zlib, "Requires zlib support") 2708 def test_custom_zlib_error_is_wrapped(self): 2709 # Check zlib codec gives a good error for malformed input 2710 msg = "^decoding with 'zlib_codec' codec failed" 2711 with self.assertRaisesRegex(Exception, msg) as failure: 2712 codecs.decode(b"hello", "zlib_codec") 2713 self.assertIsInstance(failure.exception.__cause__, 2714 type(failure.exception)) 2715 2716 def test_custom_hex_error_is_wrapped(self): 2717 # Check hex codec gives a good error for malformed input 2718 msg = "^decoding with 'hex_codec' codec failed" 2719 with self.assertRaisesRegex(Exception, msg) as failure: 2720 codecs.decode(b"hello", "hex_codec") 2721 self.assertIsInstance(failure.exception.__cause__, 2722 type(failure.exception)) 2723 2724 # Unfortunately, the bz2 module throws OSError, which the codec 2725 # machinery currently can't wrap :( 2726 2727 # Ensure codec aliases from http://bugs.python.org/issue7475 work 2728 def test_aliases(self): 2729 for codec_name, aliases in transform_aliases.items(): 2730 expected_name = codecs.lookup(codec_name).name 2731 for alias in aliases: 2732 with self.subTest(alias=alias): 2733 info = codecs.lookup(alias) 2734 self.assertEqual(info.name, expected_name) 2735 2736 def test_quopri_stateless(self): 2737 # Should encode with quotetabs=True 2738 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec") 2739 self.assertEqual(encoded, b"space=20tab=09eol=20\n") 2740 # But should still support unescaped tabs and spaces 2741 unescaped = b"space tab eol\n" 2742 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped) 2743 2744 def test_uu_invalid(self): 2745 # Missing "begin" line 2746 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec") 2747 2748 2749# The codec system tries to wrap exceptions in order to ensure the error 2750# mentions the operation being performed and the codec involved. We 2751# currently *only* want this to happen for relatively stateless 2752# exceptions, where the only significant information they contain is their 2753# type and a single str argument. 2754 2755# Use a local codec registry to avoid appearing to leak objects when 2756# registering multiple search functions 2757_TEST_CODECS = {} 2758 2759def _get_test_codec(codec_name): 2760 return _TEST_CODECS.get(codec_name) 2761codecs.register(_get_test_codec) # Returns None, not usable as a decorator 2762 2763try: 2764 # Issue #22166: Also need to clear the internal cache in CPython 2765 from _codecs import _forget_codec 2766except ImportError: 2767 def _forget_codec(codec_name): 2768 pass 2769 2770 2771class ExceptionChainingTest(unittest.TestCase): 2772 2773 def setUp(self): 2774 # There's no way to unregister a codec search function, so we just 2775 # ensure we render this one fairly harmless after the test 2776 # case finishes by using the test case repr as the codec name 2777 # The codecs module normalizes codec names, although this doesn't 2778 # appear to be formally documented... 2779 # We also make sure we use a truly unique id for the custom codec 2780 # to avoid issues with the codec cache when running these tests 2781 # multiple times (e.g. when hunting for refleaks) 2782 unique_id = repr(self) + str(id(self)) 2783 self.codec_name = encodings.normalize_encoding(unique_id).lower() 2784 2785 # We store the object to raise on the instance because of a bad 2786 # interaction between the codec caching (which means we can't 2787 # recreate the codec entry) and regrtest refleak hunting (which 2788 # runs the same test instance multiple times). This means we 2789 # need to ensure the codecs call back in to the instance to find 2790 # out which exception to raise rather than binding them in a 2791 # closure to an object that may change on the next run 2792 self.obj_to_raise = RuntimeError 2793 2794 def tearDown(self): 2795 _TEST_CODECS.pop(self.codec_name, None) 2796 # Issue #22166: Also pop from caches to avoid appearance of ref leaks 2797 encodings._cache.pop(self.codec_name, None) 2798 try: 2799 _forget_codec(self.codec_name) 2800 except KeyError: 2801 pass 2802 2803 def set_codec(self, encode, decode): 2804 codec_info = codecs.CodecInfo(encode, decode, 2805 name=self.codec_name) 2806 _TEST_CODECS[self.codec_name] = codec_info 2807 2808 @contextlib.contextmanager 2809 def assertWrapped(self, operation, exc_type, msg): 2810 full_msg = r"{} with {!r} codec failed \({}: {}\)".format( 2811 operation, self.codec_name, exc_type.__name__, msg) 2812 with self.assertRaisesRegex(exc_type, full_msg) as caught: 2813 yield caught 2814 self.assertIsInstance(caught.exception.__cause__, exc_type) 2815 self.assertIsNotNone(caught.exception.__cause__.__traceback__) 2816 2817 def raise_obj(self, *args, **kwds): 2818 # Helper to dynamically change the object raised by a test codec 2819 raise self.obj_to_raise 2820 2821 def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError): 2822 self.obj_to_raise = obj_to_raise 2823 self.set_codec(self.raise_obj, self.raise_obj) 2824 with self.assertWrapped("encoding", exc_type, msg): 2825 "str_input".encode(self.codec_name) 2826 with self.assertWrapped("encoding", exc_type, msg): 2827 codecs.encode("str_input", self.codec_name) 2828 with self.assertWrapped("decoding", exc_type, msg): 2829 b"bytes input".decode(self.codec_name) 2830 with self.assertWrapped("decoding", exc_type, msg): 2831 codecs.decode(b"bytes input", self.codec_name) 2832 2833 def test_raise_by_type(self): 2834 self.check_wrapped(RuntimeError, "") 2835 2836 def test_raise_by_value(self): 2837 msg = "This should be wrapped" 2838 self.check_wrapped(RuntimeError(msg), msg) 2839 2840 def test_raise_grandchild_subclass_exact_size(self): 2841 msg = "This should be wrapped" 2842 class MyRuntimeError(RuntimeError): 2843 __slots__ = () 2844 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError) 2845 2846 def test_raise_subclass_with_weakref_support(self): 2847 msg = "This should be wrapped" 2848 class MyRuntimeError(RuntimeError): 2849 pass 2850 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError) 2851 2852 def check_not_wrapped(self, obj_to_raise, msg): 2853 def raise_obj(*args, **kwds): 2854 raise obj_to_raise 2855 self.set_codec(raise_obj, raise_obj) 2856 with self.assertRaisesRegex(RuntimeError, msg): 2857 "str input".encode(self.codec_name) 2858 with self.assertRaisesRegex(RuntimeError, msg): 2859 codecs.encode("str input", self.codec_name) 2860 with self.assertRaisesRegex(RuntimeError, msg): 2861 b"bytes input".decode(self.codec_name) 2862 with self.assertRaisesRegex(RuntimeError, msg): 2863 codecs.decode(b"bytes input", self.codec_name) 2864 2865 def test_init_override_is_not_wrapped(self): 2866 class CustomInit(RuntimeError): 2867 def __init__(self): 2868 pass 2869 self.check_not_wrapped(CustomInit, "") 2870 2871 def test_new_override_is_not_wrapped(self): 2872 class CustomNew(RuntimeError): 2873 def __new__(cls): 2874 return super().__new__(cls) 2875 self.check_not_wrapped(CustomNew, "") 2876 2877 def test_instance_attribute_is_not_wrapped(self): 2878 msg = "This should NOT be wrapped" 2879 exc = RuntimeError(msg) 2880 exc.attr = 1 2881 self.check_not_wrapped(exc, "^{}$".format(msg)) 2882 2883 def test_non_str_arg_is_not_wrapped(self): 2884 self.check_not_wrapped(RuntimeError(1), "1") 2885 2886 def test_multiple_args_is_not_wrapped(self): 2887 msg_re = r"^\('a', 'b', 'c'\)$" 2888 self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re) 2889 2890 # http://bugs.python.org/issue19609 2891 def test_codec_lookup_failure_not_wrapped(self): 2892 msg = "^unknown encoding: {}$".format(self.codec_name) 2893 # The initial codec lookup should not be wrapped 2894 with self.assertRaisesRegex(LookupError, msg): 2895 "str input".encode(self.codec_name) 2896 with self.assertRaisesRegex(LookupError, msg): 2897 codecs.encode("str input", self.codec_name) 2898 with self.assertRaisesRegex(LookupError, msg): 2899 b"bytes input".decode(self.codec_name) 2900 with self.assertRaisesRegex(LookupError, msg): 2901 codecs.decode(b"bytes input", self.codec_name) 2902 2903 def test_unflagged_non_text_codec_handling(self): 2904 # The stdlib non-text codecs are now marked so they're 2905 # pre-emptively skipped by the text model related methods 2906 # However, third party codecs won't be flagged, so we still make 2907 # sure the case where an inappropriate output type is produced is 2908 # handled appropriately 2909 def encode_to_str(*args, **kwds): 2910 return "not bytes!", 0 2911 def decode_to_bytes(*args, **kwds): 2912 return b"not str!", 0 2913 self.set_codec(encode_to_str, decode_to_bytes) 2914 # No input or output type checks on the codecs module functions 2915 encoded = codecs.encode(None, self.codec_name) 2916 self.assertEqual(encoded, "not bytes!") 2917 decoded = codecs.decode(None, self.codec_name) 2918 self.assertEqual(decoded, b"not str!") 2919 # Text model methods should complain 2920 fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; " 2921 r"use codecs.encode\(\) to encode to arbitrary types$") 2922 msg = fmt.format(self.codec_name) 2923 with self.assertRaisesRegex(TypeError, msg): 2924 "str_input".encode(self.codec_name) 2925 fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; " 2926 r"use codecs.decode\(\) to decode to arbitrary types$") 2927 msg = fmt.format(self.codec_name) 2928 with self.assertRaisesRegex(TypeError, msg): 2929 b"bytes input".decode(self.codec_name) 2930 2931 2932 2933@unittest.skipUnless(sys.platform == 'win32', 2934 'code pages are specific to Windows') 2935class CodePageTest(unittest.TestCase): 2936 CP_UTF8 = 65001 2937 2938 def test_invalid_code_page(self): 2939 self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a') 2940 self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a') 2941 self.assertRaises(OSError, codecs.code_page_encode, 123, 'a') 2942 self.assertRaises(OSError, codecs.code_page_decode, 123, b'a') 2943 2944 def test_code_page_name(self): 2945 self.assertRaisesRegex(UnicodeEncodeError, 'cp932', 2946 codecs.code_page_encode, 932, '\xff') 2947 self.assertRaisesRegex(UnicodeDecodeError, 'cp932', 2948 codecs.code_page_decode, 932, b'\x81\x00', 'strict', True) 2949 self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8', 2950 codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True) 2951 2952 def check_decode(self, cp, tests): 2953 for raw, errors, expected in tests: 2954 if expected is not None: 2955 try: 2956 decoded = codecs.code_page_decode(cp, raw, errors, True) 2957 except UnicodeDecodeError as err: 2958 self.fail('Unable to decode %a from "cp%s" with ' 2959 'errors=%r: %s' % (raw, cp, errors, err)) 2960 self.assertEqual(decoded[0], expected, 2961 '%a.decode("cp%s", %r)=%a != %a' 2962 % (raw, cp, errors, decoded[0], expected)) 2963 # assert 0 <= decoded[1] <= len(raw) 2964 self.assertGreaterEqual(decoded[1], 0) 2965 self.assertLessEqual(decoded[1], len(raw)) 2966 else: 2967 self.assertRaises(UnicodeDecodeError, 2968 codecs.code_page_decode, cp, raw, errors, True) 2969 2970 def check_encode(self, cp, tests): 2971 for text, errors, expected in tests: 2972 if expected is not None: 2973 try: 2974 encoded = codecs.code_page_encode(cp, text, errors) 2975 except UnicodeEncodeError as err: 2976 self.fail('Unable to encode %a to "cp%s" with ' 2977 'errors=%r: %s' % (text, cp, errors, err)) 2978 self.assertEqual(encoded[0], expected, 2979 '%a.encode("cp%s", %r)=%a != %a' 2980 % (text, cp, errors, encoded[0], expected)) 2981 self.assertEqual(encoded[1], len(text)) 2982 else: 2983 self.assertRaises(UnicodeEncodeError, 2984 codecs.code_page_encode, cp, text, errors) 2985 2986 def test_cp932(self): 2987 self.check_encode(932, ( 2988 ('abc', 'strict', b'abc'), 2989 ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'), 2990 # test error handlers 2991 ('\xff', 'strict', None), 2992 ('[\xff]', 'ignore', b'[]'), 2993 ('[\xff]', 'replace', b'[y]'), 2994 ('[\u20ac]', 'replace', b'[?]'), 2995 ('[\xff]', 'backslashreplace', b'[\\xff]'), 2996 ('[\xff]', 'namereplace', 2997 b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'), 2998 ('[\xff]', 'xmlcharrefreplace', b'[ÿ]'), 2999 ('\udcff', 'strict', None), 3000 ('[\udcff]', 'surrogateescape', b'[\xff]'), 3001 ('[\udcff]', 'surrogatepass', None), 3002 )) 3003 self.check_decode(932, ( 3004 (b'abc', 'strict', 'abc'), 3005 (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'), 3006 # invalid bytes 3007 (b'[\xff]', 'strict', None), 3008 (b'[\xff]', 'ignore', '[]'), 3009 (b'[\xff]', 'replace', '[\ufffd]'), 3010 (b'[\xff]', 'backslashreplace', '[\\xff]'), 3011 (b'[\xff]', 'surrogateescape', '[\udcff]'), 3012 (b'[\xff]', 'surrogatepass', None), 3013 (b'\x81\x00abc', 'strict', None), 3014 (b'\x81\x00abc', 'ignore', '\x00abc'), 3015 (b'\x81\x00abc', 'replace', '\ufffd\x00abc'), 3016 (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'), 3017 )) 3018 3019 def test_cp1252(self): 3020 self.check_encode(1252, ( 3021 ('abc', 'strict', b'abc'), 3022 ('\xe9\u20ac', 'strict', b'\xe9\x80'), 3023 ('\xff', 'strict', b'\xff'), 3024 # test error handlers 3025 ('\u0141', 'strict', None), 3026 ('\u0141', 'ignore', b''), 3027 ('\u0141', 'replace', b'L'), 3028 ('\udc98', 'surrogateescape', b'\x98'), 3029 ('\udc98', 'surrogatepass', None), 3030 )) 3031 self.check_decode(1252, ( 3032 (b'abc', 'strict', 'abc'), 3033 (b'\xe9\x80', 'strict', '\xe9\u20ac'), 3034 (b'\xff', 'strict', '\xff'), 3035 )) 3036 3037 def test_cp_utf7(self): 3038 cp = 65000 3039 self.check_encode(cp, ( 3040 ('abc', 'strict', b'abc'), 3041 ('\xe9\u20ac', 'strict', b'+AOkgrA-'), 3042 ('\U0010ffff', 'strict', b'+2//f/w-'), 3043 ('\udc80', 'strict', b'+3IA-'), 3044 ('\ufffd', 'strict', b'+//0-'), 3045 )) 3046 self.check_decode(cp, ( 3047 (b'abc', 'strict', 'abc'), 3048 (b'+AOkgrA-', 'strict', '\xe9\u20ac'), 3049 (b'+2//f/w-', 'strict', '\U0010ffff'), 3050 (b'+3IA-', 'strict', '\udc80'), 3051 (b'+//0-', 'strict', '\ufffd'), 3052 # invalid bytes 3053 (b'[+/]', 'strict', '[]'), 3054 (b'[\xff]', 'strict', '[\xff]'), 3055 )) 3056 3057 def test_multibyte_encoding(self): 3058 self.check_decode(932, ( 3059 (b'\x84\xe9\x80', 'ignore', '\u9a3e'), 3060 (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'), 3061 )) 3062 self.check_decode(self.CP_UTF8, ( 3063 (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'), 3064 (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'), 3065 )) 3066 self.check_encode(self.CP_UTF8, ( 3067 ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'), 3068 ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'), 3069 )) 3070 3071 def test_code_page_decode_flags(self): 3072 # Issue #36312: For some code pages (e.g. UTF-7) flags for 3073 # MultiByteToWideChar() must be set to 0. 3074 if support.verbose: 3075 sys.stdout.write('\n') 3076 for cp in (50220, 50221, 50222, 50225, 50227, 50229, 3077 *range(57002, 57011+1), 65000): 3078 # On small versions of Windows like Windows IoT 3079 # not all codepages are present. 3080 # A missing codepage causes an OSError exception 3081 # so check for the codepage before decoding 3082 if is_code_page_present(cp): 3083 self.assertEqual(codecs.code_page_decode(cp, b'abc'), ('abc', 3), f'cp{cp}') 3084 else: 3085 if support.verbose: 3086 print(f" skipping cp={cp}") 3087 self.assertEqual(codecs.code_page_decode(42, b'abc'), 3088 ('\uf061\uf062\uf063', 3)) 3089 3090 def test_incremental(self): 3091 decoded = codecs.code_page_decode(932, b'\x82', 'strict', False) 3092 self.assertEqual(decoded, ('', 0)) 3093 3094 decoded = codecs.code_page_decode(932, 3095 b'\xe9\x80\xe9', 'strict', 3096 False) 3097 self.assertEqual(decoded, ('\u9a3e', 2)) 3098 3099 decoded = codecs.code_page_decode(932, 3100 b'\xe9\x80\xe9\x80', 'strict', 3101 False) 3102 self.assertEqual(decoded, ('\u9a3e\u9a3e', 4)) 3103 3104 decoded = codecs.code_page_decode(932, 3105 b'abc', 'strict', 3106 False) 3107 self.assertEqual(decoded, ('abc', 3)) 3108 3109 def test_mbcs_alias(self): 3110 # Check that looking up our 'default' codepage will return 3111 # mbcs when we don't have a more specific one available 3112 with mock.patch('_winapi.GetACP', return_value=123): 3113 codec = codecs.lookup('cp123') 3114 self.assertEqual(codec.name, 'mbcs') 3115 3116 @support.bigmemtest(size=2**31, memuse=7, dry_run=False) 3117 def test_large_input(self, size): 3118 # Test input longer than INT_MAX. 3119 # Input should contain undecodable bytes before and after 3120 # the INT_MAX limit. 3121 encoded = (b'01234567' * ((size//8)-1) + 3122 b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff') 3123 self.assertEqual(len(encoded), size+2) 3124 decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True) 3125 self.assertEqual(decoded[1], len(encoded)) 3126 del encoded 3127 self.assertEqual(len(decoded[0]), decoded[1]) 3128 self.assertEqual(decoded[0][:10], '0123456701') 3129 self.assertEqual(decoded[0][-20:], 3130 '6701234567' 3131 '\udc85\udc86\udcea\udceb\udcec' 3132 '\udcef\udcfc\udcfd\udcfe\udcff') 3133 3134 @support.bigmemtest(size=2**31, memuse=6, dry_run=False) 3135 def test_large_utf8_input(self, size): 3136 # Test input longer than INT_MAX. 3137 # Input should contain a decodable multi-byte character 3138 # surrounding INT_MAX 3139 encoded = (b'0123456\xed\x84\x80' * (size//8)) 3140 self.assertEqual(len(encoded), size // 8 * 10) 3141 decoded = codecs.code_page_decode(65001, encoded, 'ignore', True) 3142 self.assertEqual(decoded[1], len(encoded)) 3143 del encoded 3144 self.assertEqual(len(decoded[0]), size) 3145 self.assertEqual(decoded[0][:10], '0123456\ud10001') 3146 self.assertEqual(decoded[0][-11:], '56\ud1000123456\ud100') 3147 3148 3149class ASCIITest(unittest.TestCase): 3150 def test_encode(self): 3151 self.assertEqual('abc123'.encode('ascii'), b'abc123') 3152 3153 def test_encode_error(self): 3154 for data, error_handler, expected in ( 3155 ('[\x80\xff\u20ac]', 'ignore', b'[]'), 3156 ('[\x80\xff\u20ac]', 'replace', b'[???]'), 3157 ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[€ÿ€]'), 3158 ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace', 3159 b'[\\x80\\xff\\u20ac\\U000abcde]'), 3160 ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'), 3161 ): 3162 with self.subTest(data=data, error_handler=error_handler, 3163 expected=expected): 3164 self.assertEqual(data.encode('ascii', error_handler), 3165 expected) 3166 3167 def test_encode_surrogateescape_error(self): 3168 with self.assertRaises(UnicodeEncodeError): 3169 # the first character can be decoded, but not the second 3170 '\udc80\xff'.encode('ascii', 'surrogateescape') 3171 3172 def test_decode(self): 3173 self.assertEqual(b'abc'.decode('ascii'), 'abc') 3174 3175 def test_decode_error(self): 3176 for data, error_handler, expected in ( 3177 (b'[\x80\xff]', 'ignore', '[]'), 3178 (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'), 3179 (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'), 3180 (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'), 3181 ): 3182 with self.subTest(data=data, error_handler=error_handler, 3183 expected=expected): 3184 self.assertEqual(data.decode('ascii', error_handler), 3185 expected) 3186 3187 3188class Latin1Test(unittest.TestCase): 3189 def test_encode(self): 3190 for data, expected in ( 3191 ('abc', b'abc'), 3192 ('\x80\xe9\xff', b'\x80\xe9\xff'), 3193 ): 3194 with self.subTest(data=data, expected=expected): 3195 self.assertEqual(data.encode('latin1'), expected) 3196 3197 def test_encode_errors(self): 3198 for data, error_handler, expected in ( 3199 ('[\u20ac\udc80]', 'ignore', b'[]'), 3200 ('[\u20ac\udc80]', 'replace', b'[??]'), 3201 ('[\u20ac\U000abcde]', 'backslashreplace', 3202 b'[\\u20ac\\U000abcde]'), 3203 ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[€�]'), 3204 ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'), 3205 ): 3206 with self.subTest(data=data, error_handler=error_handler, 3207 expected=expected): 3208 self.assertEqual(data.encode('latin1', error_handler), 3209 expected) 3210 3211 def test_encode_surrogateescape_error(self): 3212 with self.assertRaises(UnicodeEncodeError): 3213 # the first character can be decoded, but not the second 3214 '\udc80\u20ac'.encode('latin1', 'surrogateescape') 3215 3216 def test_decode(self): 3217 for data, expected in ( 3218 (b'abc', 'abc'), 3219 (b'[\x80\xff]', '[\x80\xff]'), 3220 ): 3221 with self.subTest(data=data, expected=expected): 3222 self.assertEqual(data.decode('latin1'), expected) 3223 3224 3225class StreamRecoderTest(unittest.TestCase): 3226 def test_writelines(self): 3227 bio = io.BytesIO() 3228 codec = codecs.lookup('ascii') 3229 sr = codecs.StreamRecoder(bio, codec.encode, codec.decode, 3230 encodings.ascii.StreamReader, encodings.ascii.StreamWriter) 3231 sr.writelines([b'a', b'b']) 3232 self.assertEqual(bio.getvalue(), b'ab') 3233 3234 def test_write(self): 3235 bio = io.BytesIO() 3236 codec = codecs.lookup('latin1') 3237 # Recode from Latin-1 to utf-8. 3238 sr = codecs.StreamRecoder(bio, codec.encode, codec.decode, 3239 encodings.utf_8.StreamReader, encodings.utf_8.StreamWriter) 3240 3241 text = 'àñé' 3242 sr.write(text.encode('latin1')) 3243 self.assertEqual(bio.getvalue(), text.encode('utf-8')) 3244 3245 def test_seeking_read(self): 3246 bio = io.BytesIO('line1\nline2\nline3\n'.encode('utf-16-le')) 3247 sr = codecs.EncodedFile(bio, 'utf-8', 'utf-16-le') 3248 3249 self.assertEqual(sr.readline(), b'line1\n') 3250 sr.seek(0) 3251 self.assertEqual(sr.readline(), b'line1\n') 3252 self.assertEqual(sr.readline(), b'line2\n') 3253 self.assertEqual(sr.readline(), b'line3\n') 3254 self.assertEqual(sr.readline(), b'') 3255 3256 def test_seeking_write(self): 3257 bio = io.BytesIO('123456789\n'.encode('utf-16-le')) 3258 sr = codecs.EncodedFile(bio, 'utf-8', 'utf-16-le') 3259 3260 # Test that seek() only resets its internal buffer when offset 3261 # and whence are zero. 3262 sr.seek(2) 3263 sr.write(b'\nabc\n') 3264 self.assertEqual(sr.readline(), b'789\n') 3265 sr.seek(0) 3266 self.assertEqual(sr.readline(), b'1\n') 3267 self.assertEqual(sr.readline(), b'abc\n') 3268 self.assertEqual(sr.readline(), b'789\n') 3269 3270 3271@unittest.skipIf(_testcapi is None, 'need _testcapi module') 3272class LocaleCodecTest(unittest.TestCase): 3273 """ 3274 Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex(). 3275 """ 3276 ENCODING = sys.getfilesystemencoding() 3277 STRINGS = ("ascii", "ulatin1:\xa7\xe9", 3278 "u255:\xff", 3279 "UCS:\xe9\u20ac\U0010ffff", 3280 "surrogates:\uDC80\uDCFF") 3281 BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff") 3282 SURROGATES = "\uDC80\uDCFF" 3283 3284 def encode(self, text, errors="strict"): 3285 return _testcapi.EncodeLocaleEx(text, 0, errors) 3286 3287 def check_encode_strings(self, errors): 3288 for text in self.STRINGS: 3289 with self.subTest(text=text): 3290 try: 3291 expected = text.encode(self.ENCODING, errors) 3292 except UnicodeEncodeError: 3293 with self.assertRaises(RuntimeError) as cm: 3294 self.encode(text, errors) 3295 errmsg = str(cm.exception) 3296 self.assertRegex(errmsg, r"encode error: pos=[0-9]+, reason=") 3297 else: 3298 encoded = self.encode(text, errors) 3299 self.assertEqual(encoded, expected) 3300 3301 def test_encode_strict(self): 3302 self.check_encode_strings("strict") 3303 3304 def test_encode_surrogateescape(self): 3305 self.check_encode_strings("surrogateescape") 3306 3307 def test_encode_surrogatepass(self): 3308 try: 3309 self.encode('', 'surrogatepass') 3310 except ValueError as exc: 3311 if str(exc) == 'unsupported error handler': 3312 self.skipTest(f"{self.ENCODING!r} encoder doesn't support " 3313 f"surrogatepass error handler") 3314 else: 3315 raise 3316 3317 self.check_encode_strings("surrogatepass") 3318 3319 def test_encode_unsupported_error_handler(self): 3320 with self.assertRaises(ValueError) as cm: 3321 self.encode('', 'backslashreplace') 3322 self.assertEqual(str(cm.exception), 'unsupported error handler') 3323 3324 def decode(self, encoded, errors="strict"): 3325 return _testcapi.DecodeLocaleEx(encoded, 0, errors) 3326 3327 def check_decode_strings(self, errors): 3328 is_utf8 = (self.ENCODING == "utf-8") 3329 if is_utf8: 3330 encode_errors = 'surrogateescape' 3331 else: 3332 encode_errors = 'strict' 3333 3334 strings = list(self.BYTES_STRINGS) 3335 for text in self.STRINGS: 3336 try: 3337 encoded = text.encode(self.ENCODING, encode_errors) 3338 if encoded not in strings: 3339 strings.append(encoded) 3340 except UnicodeEncodeError: 3341 encoded = None 3342 3343 if is_utf8: 3344 encoded2 = text.encode(self.ENCODING, 'surrogatepass') 3345 if encoded2 != encoded: 3346 strings.append(encoded2) 3347 3348 for encoded in strings: 3349 with self.subTest(encoded=encoded): 3350 try: 3351 expected = encoded.decode(self.ENCODING, errors) 3352 except UnicodeDecodeError: 3353 with self.assertRaises(RuntimeError) as cm: 3354 self.decode(encoded, errors) 3355 errmsg = str(cm.exception) 3356 self.assertTrue(errmsg.startswith("decode error: "), errmsg) 3357 else: 3358 decoded = self.decode(encoded, errors) 3359 self.assertEqual(decoded, expected) 3360 3361 def test_decode_strict(self): 3362 self.check_decode_strings("strict") 3363 3364 def test_decode_surrogateescape(self): 3365 self.check_decode_strings("surrogateescape") 3366 3367 def test_decode_surrogatepass(self): 3368 try: 3369 self.decode(b'', 'surrogatepass') 3370 except ValueError as exc: 3371 if str(exc) == 'unsupported error handler': 3372 self.skipTest(f"{self.ENCODING!r} decoder doesn't support " 3373 f"surrogatepass error handler") 3374 else: 3375 raise 3376 3377 self.check_decode_strings("surrogatepass") 3378 3379 def test_decode_unsupported_error_handler(self): 3380 with self.assertRaises(ValueError) as cm: 3381 self.decode(b'', 'backslashreplace') 3382 self.assertEqual(str(cm.exception), 'unsupported error handler') 3383 3384 3385class Rot13Test(unittest.TestCase): 3386 """Test the educational ROT-13 codec.""" 3387 def test_encode(self): 3388 ciphertext = codecs.encode("Caesar liked ciphers", 'rot-13') 3389 self.assertEqual(ciphertext, 'Pnrfne yvxrq pvcuref') 3390 3391 def test_decode(self): 3392 plaintext = codecs.decode('Rg gh, Oehgr?', 'rot-13') 3393 self.assertEqual(plaintext, 'Et tu, Brute?') 3394 3395 def test_incremental_encode(self): 3396 encoder = codecs.getincrementalencoder('rot-13')() 3397 ciphertext = encoder.encode('ABBA nag Cheryl Baker') 3398 self.assertEqual(ciphertext, 'NOON ant Purely Onxre') 3399 3400 def test_incremental_decode(self): 3401 decoder = codecs.getincrementaldecoder('rot-13')() 3402 plaintext = decoder.decode('terra Ares envy tha') 3403 self.assertEqual(plaintext, 'green Nerf rail gun') 3404 3405 3406class Rot13UtilTest(unittest.TestCase): 3407 """Test the ROT-13 codec via rot13 function, 3408 i.e. the user has done something like: 3409 $ echo "Hello World" | python -m encodings.rot_13 3410 """ 3411 def test_rot13_func(self): 3412 infile = io.StringIO('Gb or, be abg gb or, gung vf gur dhrfgvba') 3413 outfile = io.StringIO() 3414 encodings.rot_13.rot13(infile, outfile) 3415 outfile.seek(0) 3416 plain_text = outfile.read() 3417 self.assertEqual( 3418 plain_text, 3419 'To be, or not to be, that is the question') 3420 3421 3422if __name__ == "__main__": 3423 unittest.main() 3424