1import codecs 2import html.entities 3import itertools 4import sys 5import unicodedata 6import unittest 7 8 9class PosReturn: 10 # this can be used for configurable callbacks 11 12 def __init__(self): 13 self.pos = 0 14 15 def handle(self, exc): 16 oldpos = self.pos 17 realpos = oldpos 18 if realpos<0: 19 realpos = len(exc.object) + realpos 20 # if we don't advance this time, terminate on the next call 21 # otherwise we'd get an endless loop 22 if realpos <= exc.start: 23 self.pos = len(exc.object) 24 return ("<?>", oldpos) 25 26class RepeatedPosReturn: 27 def __init__(self, repl="<?>"): 28 self.repl = repl 29 self.pos = 0 30 self.count = 0 31 32 def handle(self, exc): 33 if self.count > 0: 34 self.count -= 1 35 return (self.repl, self.pos) 36 return (self.repl, exc.end) 37 38# A UnicodeEncodeError object with a bad start attribute 39class BadStartUnicodeEncodeError(UnicodeEncodeError): 40 def __init__(self): 41 UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad") 42 self.start = [] 43 44# A UnicodeEncodeError object with a bad object attribute 45class BadObjectUnicodeEncodeError(UnicodeEncodeError): 46 def __init__(self): 47 UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad") 48 self.object = [] 49 50# A UnicodeDecodeError object without an end attribute 51class NoEndUnicodeDecodeError(UnicodeDecodeError): 52 def __init__(self): 53 UnicodeDecodeError.__init__(self, "ascii", bytearray(b""), 0, 1, "bad") 54 del self.end 55 56# A UnicodeDecodeError object with a bad object attribute 57class BadObjectUnicodeDecodeError(UnicodeDecodeError): 58 def __init__(self): 59 UnicodeDecodeError.__init__(self, "ascii", bytearray(b""), 0, 1, "bad") 60 self.object = [] 61 62# A UnicodeTranslateError object without a start attribute 63class NoStartUnicodeTranslateError(UnicodeTranslateError): 64 def __init__(self): 65 UnicodeTranslateError.__init__(self, "", 0, 1, "bad") 66 del self.start 67 68# A UnicodeTranslateError object without an end attribute 69class NoEndUnicodeTranslateError(UnicodeTranslateError): 70 def __init__(self): 71 UnicodeTranslateError.__init__(self, "", 0, 1, "bad") 72 del self.end 73 74# A UnicodeTranslateError object without an object attribute 75class NoObjectUnicodeTranslateError(UnicodeTranslateError): 76 def __init__(self): 77 UnicodeTranslateError.__init__(self, "", 0, 1, "bad") 78 del self.object 79 80class CodecCallbackTest(unittest.TestCase): 81 82 def test_xmlcharrefreplace(self): 83 # replace unencodable characters which numeric character entities. 84 # For ascii, latin-1 and charmaps this is completely implemented 85 # in C and should be reasonably fast. 86 s = "\u30b9\u30d1\u30e2 \xe4nd eggs" 87 self.assertEqual( 88 s.encode("ascii", "xmlcharrefreplace"), 89 b"スパモ änd eggs" 90 ) 91 self.assertEqual( 92 s.encode("latin-1", "xmlcharrefreplace"), 93 b"スパモ \xe4nd eggs" 94 ) 95 96 def test_xmlcharnamereplace(self): 97 # This time use a named character entity for unencodable 98 # characters, if one is available. 99 100 def xmlcharnamereplace(exc): 101 if not isinstance(exc, UnicodeEncodeError): 102 raise TypeError("don't know how to handle %r" % exc) 103 l = [] 104 for c in exc.object[exc.start:exc.end]: 105 try: 106 l.append("&%s;" % html.entities.codepoint2name[ord(c)]) 107 except KeyError: 108 l.append("&#%d;" % ord(c)) 109 return ("".join(l), exc.end) 110 111 codecs.register_error( 112 "test.xmlcharnamereplace", xmlcharnamereplace) 113 114 sin = "\xab\u211c\xbb = \u2329\u1234\u20ac\u232a" 115 sout = b"«ℜ» = ⟨ሴ€⟩" 116 self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout) 117 sout = b"\xabℜ\xbb = ⟨ሴ€⟩" 118 self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout) 119 sout = b"\xabℜ\xbb = ⟨ሴ\xa4⟩" 120 self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout) 121 122 def test_uninamereplace(self): 123 # We're using the names from the unicode database this time, 124 # and we're doing "syntax highlighting" here, i.e. we include 125 # the replaced text in ANSI escape sequences. For this it is 126 # useful that the error handler is not called for every single 127 # unencodable character, but for a complete sequence of 128 # unencodable characters, otherwise we would output many 129 # unnecessary escape sequences. 130 131 def uninamereplace(exc): 132 if not isinstance(exc, UnicodeEncodeError): 133 raise TypeError("don't know how to handle %r" % exc) 134 l = [] 135 for c in exc.object[exc.start:exc.end]: 136 l.append(unicodedata.name(c, "0x%x" % ord(c))) 137 return ("\033[1m%s\033[0m" % ", ".join(l), exc.end) 138 139 codecs.register_error( 140 "test.uninamereplace", uninamereplace) 141 142 sin = "\xac\u1234\u20ac\u8000" 143 sout = b"\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m" 144 self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout) 145 146 sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m" 147 self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout) 148 149 sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m" 150 self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout) 151 152 def test_backslashescape(self): 153 # Does the same as the "unicode-escape" encoding, but with different 154 # base encodings. 155 sin = "a\xac\u1234\u20ac\u8000\U0010ffff" 156 sout = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff" 157 self.assertEqual(sin.encode("ascii", "backslashreplace"), sout) 158 159 sout = b"a\xac\\u1234\\u20ac\\u8000\\U0010ffff" 160 self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout) 161 162 sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff" 163 self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout) 164 165 def test_nameescape(self): 166 # Does the same as backslashescape, but prefers ``\N{...}`` escape 167 # sequences. 168 sin = "a\xac\u1234\u20ac\u8000\U0010ffff" 169 sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}' 170 b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff') 171 self.assertEqual(sin.encode("ascii", "namereplace"), sout) 172 173 sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}' 174 b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff') 175 self.assertEqual(sin.encode("latin-1", "namereplace"), sout) 176 177 sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4' 178 b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff') 179 self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout) 180 181 def test_decoding_callbacks(self): 182 # This is a test for a decoding callback handler 183 # that allows the decoding of the invalid sequence 184 # "\xc0\x80" and returns "\x00" instead of raising an error. 185 # All other illegal sequences will be handled strictly. 186 def relaxedutf8(exc): 187 if not isinstance(exc, UnicodeDecodeError): 188 raise TypeError("don't know how to handle %r" % exc) 189 if exc.object[exc.start:exc.start+2] == b"\xc0\x80": 190 return ("\x00", exc.start+2) # retry after two bytes 191 else: 192 raise exc 193 194 codecs.register_error("test.relaxedutf8", relaxedutf8) 195 196 # all the "\xc0\x80" will be decoded to "\x00" 197 sin = b"a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80" 198 sout = "a\x00b\x00c\xfc\x00\x00" 199 self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout) 200 201 # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised 202 sin = b"\xc0\x80\xc0\x81" 203 self.assertRaises(UnicodeDecodeError, sin.decode, 204 "utf-8", "test.relaxedutf8") 205 206 def test_charmapencode(self): 207 # For charmap encodings the replacement string will be 208 # mapped through the encoding again. This means, that 209 # to be able to use e.g. the "replace" handler, the 210 # charmap has to have a mapping for "?". 211 charmap = dict((ord(c), bytes(2*c.upper(), 'ascii')) for c in "abcdefgh") 212 sin = "abc" 213 sout = b"AABBCC" 214 self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout) 215 216 sin = "abcA" 217 self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap) 218 219 charmap[ord("?")] = b"XYZ" 220 sin = "abcDEF" 221 sout = b"AABBCCXYZXYZXYZ" 222 self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout) 223 224 charmap[ord("?")] = "XYZ" # wrong type in mapping 225 self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap) 226 227 def test_callbacks(self): 228 def handler1(exc): 229 r = range(exc.start, exc.end) 230 if isinstance(exc, UnicodeEncodeError): 231 l = ["<%d>" % ord(exc.object[pos]) for pos in r] 232 elif isinstance(exc, UnicodeDecodeError): 233 l = ["<%d>" % exc.object[pos] for pos in r] 234 else: 235 raise TypeError("don't know how to handle %r" % exc) 236 return ("[%s]" % "".join(l), exc.end) 237 238 codecs.register_error("test.handler1", handler1) 239 240 def handler2(exc): 241 if not isinstance(exc, UnicodeDecodeError): 242 raise TypeError("don't know how to handle %r" % exc) 243 l = ["<%d>" % exc.object[pos] for pos in range(exc.start, exc.end)] 244 return ("[%s]" % "".join(l), exc.end+1) # skip one character 245 246 codecs.register_error("test.handler2", handler2) 247 248 s = b"\x00\x81\x7f\x80\xff" 249 250 self.assertEqual( 251 s.decode("ascii", "test.handler1"), 252 "\x00[<129>]\x7f[<128>][<255>]" 253 ) 254 self.assertEqual( 255 s.decode("ascii", "test.handler2"), 256 "\x00[<129>][<128>]" 257 ) 258 259 self.assertEqual( 260 b"\\u3042\\u3xxx".decode("unicode-escape", "test.handler1"), 261 "\u3042[<92><117><51>]xxx" 262 ) 263 264 self.assertEqual( 265 b"\\u3042\\u3xx".decode("unicode-escape", "test.handler1"), 266 "\u3042[<92><117><51>]xx" 267 ) 268 269 self.assertEqual( 270 codecs.charmap_decode(b"abc", "test.handler1", {ord("a"): "z"})[0], 271 "z[<98>][<99>]" 272 ) 273 274 self.assertEqual( 275 "g\xfc\xdfrk".encode("ascii", "test.handler1"), 276 b"g[<252><223>]rk" 277 ) 278 279 self.assertEqual( 280 "g\xfc\xdf".encode("ascii", "test.handler1"), 281 b"g[<252><223>]" 282 ) 283 284 def test_longstrings(self): 285 # test long strings to check for memory overflow problems 286 errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", 287 "backslashreplace", "namereplace"] 288 # register the handlers under different names, 289 # to prevent the codec from recognizing the name 290 for err in errors: 291 codecs.register_error("test." + err, codecs.lookup_error(err)) 292 l = 1000 293 errors += [ "test." + err for err in errors ] 294 for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]: 295 for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", 296 "utf-8", "utf-7", "utf-16", "utf-32"): 297 for err in errors: 298 try: 299 uni.encode(enc, err) 300 except UnicodeError: 301 pass 302 303 def check_exceptionobjectargs(self, exctype, args, msg): 304 # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion 305 # check with one missing argument 306 self.assertRaises(TypeError, exctype, *args[:-1]) 307 # check with one argument too much 308 self.assertRaises(TypeError, exctype, *(args + ["too much"])) 309 # check with one argument of the wrong type 310 wrongargs = [ "spam", b"eggs", b"spam", 42, 1.0, None ] 311 for i in range(len(args)): 312 for wrongarg in wrongargs: 313 if type(wrongarg) is type(args[i]): 314 continue 315 # build argument array 316 callargs = [] 317 for j in range(len(args)): 318 if i==j: 319 callargs.append(wrongarg) 320 else: 321 callargs.append(args[i]) 322 self.assertRaises(TypeError, exctype, *callargs) 323 324 # check with the correct number and type of arguments 325 exc = exctype(*args) 326 self.assertEqual(str(exc), msg) 327 328 def test_unicodeencodeerror(self): 329 self.check_exceptionobjectargs( 330 UnicodeEncodeError, 331 ["ascii", "g\xfcrk", 1, 2, "ouch"], 332 "'ascii' codec can't encode character '\\xfc' in position 1: ouch" 333 ) 334 self.check_exceptionobjectargs( 335 UnicodeEncodeError, 336 ["ascii", "g\xfcrk", 1, 4, "ouch"], 337 "'ascii' codec can't encode characters in position 1-3: ouch" 338 ) 339 self.check_exceptionobjectargs( 340 UnicodeEncodeError, 341 ["ascii", "\xfcx", 0, 1, "ouch"], 342 "'ascii' codec can't encode character '\\xfc' in position 0: ouch" 343 ) 344 self.check_exceptionobjectargs( 345 UnicodeEncodeError, 346 ["ascii", "\u0100x", 0, 1, "ouch"], 347 "'ascii' codec can't encode character '\\u0100' in position 0: ouch" 348 ) 349 self.check_exceptionobjectargs( 350 UnicodeEncodeError, 351 ["ascii", "\uffffx", 0, 1, "ouch"], 352 "'ascii' codec can't encode character '\\uffff' in position 0: ouch" 353 ) 354 self.check_exceptionobjectargs( 355 UnicodeEncodeError, 356 ["ascii", "\U00010000x", 0, 1, "ouch"], 357 "'ascii' codec can't encode character '\\U00010000' in position 0: ouch" 358 ) 359 360 def test_unicodedecodeerror(self): 361 self.check_exceptionobjectargs( 362 UnicodeDecodeError, 363 ["ascii", bytearray(b"g\xfcrk"), 1, 2, "ouch"], 364 "'ascii' codec can't decode byte 0xfc in position 1: ouch" 365 ) 366 self.check_exceptionobjectargs( 367 UnicodeDecodeError, 368 ["ascii", bytearray(b"g\xfcrk"), 1, 3, "ouch"], 369 "'ascii' codec can't decode bytes in position 1-2: ouch" 370 ) 371 372 def test_unicodetranslateerror(self): 373 self.check_exceptionobjectargs( 374 UnicodeTranslateError, 375 ["g\xfcrk", 1, 2, "ouch"], 376 "can't translate character '\\xfc' in position 1: ouch" 377 ) 378 self.check_exceptionobjectargs( 379 UnicodeTranslateError, 380 ["g\u0100rk", 1, 2, "ouch"], 381 "can't translate character '\\u0100' in position 1: ouch" 382 ) 383 self.check_exceptionobjectargs( 384 UnicodeTranslateError, 385 ["g\uffffrk", 1, 2, "ouch"], 386 "can't translate character '\\uffff' in position 1: ouch" 387 ) 388 self.check_exceptionobjectargs( 389 UnicodeTranslateError, 390 ["g\U00010000rk", 1, 2, "ouch"], 391 "can't translate character '\\U00010000' in position 1: ouch" 392 ) 393 self.check_exceptionobjectargs( 394 UnicodeTranslateError, 395 ["g\xfcrk", 1, 3, "ouch"], 396 "can't translate characters in position 1-2: ouch" 397 ) 398 399 def test_badandgoodstrictexceptions(self): 400 # "strict" complains about a non-exception passed in 401 self.assertRaises( 402 TypeError, 403 codecs.strict_errors, 404 42 405 ) 406 # "strict" complains about the wrong exception type 407 self.assertRaises( 408 Exception, 409 codecs.strict_errors, 410 Exception("ouch") 411 ) 412 413 # If the correct exception is passed in, "strict" raises it 414 self.assertRaises( 415 UnicodeEncodeError, 416 codecs.strict_errors, 417 UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch") 418 ) 419 self.assertRaises( 420 UnicodeDecodeError, 421 codecs.strict_errors, 422 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch") 423 ) 424 self.assertRaises( 425 UnicodeTranslateError, 426 codecs.strict_errors, 427 UnicodeTranslateError("\u3042", 0, 1, "ouch") 428 ) 429 430 def test_badandgoodignoreexceptions(self): 431 # "ignore" complains about a non-exception passed in 432 self.assertRaises( 433 TypeError, 434 codecs.ignore_errors, 435 42 436 ) 437 # "ignore" complains about the wrong exception type 438 self.assertRaises( 439 TypeError, 440 codecs.ignore_errors, 441 UnicodeError("ouch") 442 ) 443 # If the correct exception is passed in, "ignore" returns an empty replacement 444 self.assertEqual( 445 codecs.ignore_errors( 446 UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")), 447 ("", 2) 448 ) 449 self.assertEqual( 450 codecs.ignore_errors( 451 UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")), 452 ("", 2) 453 ) 454 self.assertEqual( 455 codecs.ignore_errors( 456 UnicodeTranslateError("a\u3042b", 1, 2, "ouch")), 457 ("", 2) 458 ) 459 460 def test_badandgoodreplaceexceptions(self): 461 # "replace" complains about a non-exception passed in 462 self.assertRaises( 463 TypeError, 464 codecs.replace_errors, 465 42 466 ) 467 # "replace" complains about the wrong exception type 468 self.assertRaises( 469 TypeError, 470 codecs.replace_errors, 471 UnicodeError("ouch") 472 ) 473 self.assertRaises( 474 TypeError, 475 codecs.replace_errors, 476 BadObjectUnicodeEncodeError() 477 ) 478 self.assertRaises( 479 TypeError, 480 codecs.replace_errors, 481 BadObjectUnicodeDecodeError() 482 ) 483 # With the correct exception, "replace" returns an "?" or "\ufffd" replacement 484 self.assertEqual( 485 codecs.replace_errors( 486 UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")), 487 ("?", 2) 488 ) 489 self.assertEqual( 490 codecs.replace_errors( 491 UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")), 492 ("\ufffd", 2) 493 ) 494 self.assertEqual( 495 codecs.replace_errors( 496 UnicodeTranslateError("a\u3042b", 1, 2, "ouch")), 497 ("\ufffd", 2) 498 ) 499 500 def test_badandgoodxmlcharrefreplaceexceptions(self): 501 # "xmlcharrefreplace" complains about a non-exception passed in 502 self.assertRaises( 503 TypeError, 504 codecs.xmlcharrefreplace_errors, 505 42 506 ) 507 # "xmlcharrefreplace" complains about the wrong exception types 508 self.assertRaises( 509 TypeError, 510 codecs.xmlcharrefreplace_errors, 511 UnicodeError("ouch") 512 ) 513 # "xmlcharrefreplace" can only be used for encoding 514 self.assertRaises( 515 TypeError, 516 codecs.xmlcharrefreplace_errors, 517 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch") 518 ) 519 self.assertRaises( 520 TypeError, 521 codecs.xmlcharrefreplace_errors, 522 UnicodeTranslateError("\u3042", 0, 1, "ouch") 523 ) 524 # Use the correct exception 525 cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 99999, 100000, 526 999999, 1000000) 527 cs += (0xd800, 0xdfff) 528 s = "".join(chr(c) for c in cs) 529 self.assertEqual( 530 codecs.xmlcharrefreplace_errors( 531 UnicodeEncodeError("ascii", "a" + s + "b", 532 1, 1 + len(s), "ouch") 533 ), 534 ("".join("&#%d;" % c for c in cs), 1 + len(s)) 535 ) 536 537 def test_badandgoodbackslashreplaceexceptions(self): 538 # "backslashreplace" complains about a non-exception passed in 539 self.assertRaises( 540 TypeError, 541 codecs.backslashreplace_errors, 542 42 543 ) 544 # "backslashreplace" complains about the wrong exception types 545 self.assertRaises( 546 TypeError, 547 codecs.backslashreplace_errors, 548 UnicodeError("ouch") 549 ) 550 # Use the correct exception 551 tests = [ 552 ("\u3042", "\\u3042"), 553 ("\n", "\\x0a"), 554 ("a", "\\x61"), 555 ("\x00", "\\x00"), 556 ("\xff", "\\xff"), 557 ("\u0100", "\\u0100"), 558 ("\uffff", "\\uffff"), 559 ("\U00010000", "\\U00010000"), 560 ("\U0010ffff", "\\U0010ffff"), 561 # Lone surrogates 562 ("\ud800", "\\ud800"), 563 ("\udfff", "\\udfff"), 564 ("\ud800\udfff", "\\ud800\\udfff"), 565 ] 566 for s, r in tests: 567 with self.subTest(str=s): 568 self.assertEqual( 569 codecs.backslashreplace_errors( 570 UnicodeEncodeError("ascii", "a" + s + "b", 571 1, 1 + len(s), "ouch")), 572 (r, 1 + len(s)) 573 ) 574 self.assertEqual( 575 codecs.backslashreplace_errors( 576 UnicodeTranslateError("a" + s + "b", 577 1, 1 + len(s), "ouch")), 578 (r, 1 + len(s)) 579 ) 580 tests = [ 581 (b"a", "\\x61"), 582 (b"\n", "\\x0a"), 583 (b"\x00", "\\x00"), 584 (b"\xff", "\\xff"), 585 ] 586 for b, r in tests: 587 with self.subTest(bytes=b): 588 self.assertEqual( 589 codecs.backslashreplace_errors( 590 UnicodeDecodeError("ascii", bytearray(b"a" + b + b"b"), 591 1, 2, "ouch")), 592 (r, 2) 593 ) 594 595 def test_badandgoodnamereplaceexceptions(self): 596 # "namereplace" complains about a non-exception passed in 597 self.assertRaises( 598 TypeError, 599 codecs.namereplace_errors, 600 42 601 ) 602 # "namereplace" complains about the wrong exception types 603 self.assertRaises( 604 TypeError, 605 codecs.namereplace_errors, 606 UnicodeError("ouch") 607 ) 608 # "namereplace" can only be used for encoding 609 self.assertRaises( 610 TypeError, 611 codecs.namereplace_errors, 612 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch") 613 ) 614 self.assertRaises( 615 TypeError, 616 codecs.namereplace_errors, 617 UnicodeTranslateError("\u3042", 0, 1, "ouch") 618 ) 619 # Use the correct exception 620 tests = [ 621 ("\u3042", "\\N{HIRAGANA LETTER A}"), 622 ("\x00", "\\x00"), 623 ("\ufbf9", "\\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH " 624 "HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}"), 625 ("\U000e007f", "\\N{CANCEL TAG}"), 626 ("\U0010ffff", "\\U0010ffff"), 627 # Lone surrogates 628 ("\ud800", "\\ud800"), 629 ("\udfff", "\\udfff"), 630 ("\ud800\udfff", "\\ud800\\udfff"), 631 ] 632 for s, r in tests: 633 with self.subTest(str=s): 634 self.assertEqual( 635 codecs.namereplace_errors( 636 UnicodeEncodeError("ascii", "a" + s + "b", 637 1, 1 + len(s), "ouch")), 638 (r, 1 + len(s)) 639 ) 640 641 def test_badandgoodsurrogateescapeexceptions(self): 642 surrogateescape_errors = codecs.lookup_error('surrogateescape') 643 # "surrogateescape" complains about a non-exception passed in 644 self.assertRaises( 645 TypeError, 646 surrogateescape_errors, 647 42 648 ) 649 # "surrogateescape" complains about the wrong exception types 650 self.assertRaises( 651 TypeError, 652 surrogateescape_errors, 653 UnicodeError("ouch") 654 ) 655 # "surrogateescape" can not be used for translating 656 self.assertRaises( 657 TypeError, 658 surrogateescape_errors, 659 UnicodeTranslateError("\udc80", 0, 1, "ouch") 660 ) 661 # Use the correct exception 662 for s in ("a", "\udc7f", "\udd00"): 663 with self.subTest(str=s): 664 self.assertRaises( 665 UnicodeEncodeError, 666 surrogateescape_errors, 667 UnicodeEncodeError("ascii", s, 0, 1, "ouch") 668 ) 669 self.assertEqual( 670 surrogateescape_errors( 671 UnicodeEncodeError("ascii", "a\udc80b", 1, 2, "ouch")), 672 (b"\x80", 2) 673 ) 674 self.assertRaises( 675 UnicodeDecodeError, 676 surrogateescape_errors, 677 UnicodeDecodeError("ascii", bytearray(b"a"), 0, 1, "ouch") 678 ) 679 self.assertEqual( 680 surrogateescape_errors( 681 UnicodeDecodeError("ascii", bytearray(b"a\x80b"), 1, 2, "ouch")), 682 ("\udc80", 2) 683 ) 684 685 def test_badandgoodsurrogatepassexceptions(self): 686 surrogatepass_errors = codecs.lookup_error('surrogatepass') 687 # "surrogatepass" complains about a non-exception passed in 688 self.assertRaises( 689 TypeError, 690 surrogatepass_errors, 691 42 692 ) 693 # "surrogatepass" complains about the wrong exception types 694 self.assertRaises( 695 TypeError, 696 surrogatepass_errors, 697 UnicodeError("ouch") 698 ) 699 # "surrogatepass" can not be used for translating 700 self.assertRaises( 701 TypeError, 702 surrogatepass_errors, 703 UnicodeTranslateError("\ud800", 0, 1, "ouch") 704 ) 705 # Use the correct exception 706 for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"): 707 with self.subTest(encoding=enc): 708 self.assertRaises( 709 UnicodeEncodeError, 710 surrogatepass_errors, 711 UnicodeEncodeError(enc, "a", 0, 1, "ouch") 712 ) 713 self.assertRaises( 714 UnicodeDecodeError, 715 surrogatepass_errors, 716 UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch") 717 ) 718 for s in ("\ud800", "\udfff", "\ud800\udfff"): 719 with self.subTest(str=s): 720 self.assertRaises( 721 UnicodeEncodeError, 722 surrogatepass_errors, 723 UnicodeEncodeError("ascii", s, 0, len(s), "ouch") 724 ) 725 tests = [ 726 ("utf-8", "\ud800", b'\xed\xa0\x80', 3), 727 ("utf-16le", "\ud800", b'\x00\xd8', 2), 728 ("utf-16be", "\ud800", b'\xd8\x00', 2), 729 ("utf-32le", "\ud800", b'\x00\xd8\x00\x00', 4), 730 ("utf-32be", "\ud800", b'\x00\x00\xd8\x00', 4), 731 ("utf-8", "\udfff", b'\xed\xbf\xbf', 3), 732 ("utf-16le", "\udfff", b'\xff\xdf', 2), 733 ("utf-16be", "\udfff", b'\xdf\xff', 2), 734 ("utf-32le", "\udfff", b'\xff\xdf\x00\x00', 4), 735 ("utf-32be", "\udfff", b'\x00\x00\xdf\xff', 4), 736 ("utf-8", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3), 737 ("utf-16le", "\ud800\udfff", b'\x00\xd8\xff\xdf', 2), 738 ("utf-16be", "\ud800\udfff", b'\xd8\x00\xdf\xff', 2), 739 ("utf-32le", "\ud800\udfff", b'\x00\xd8\x00\x00\xff\xdf\x00\x00', 4), 740 ("utf-32be", "\ud800\udfff", b'\x00\x00\xd8\x00\x00\x00\xdf\xff', 4), 741 ] 742 for enc, s, b, n in tests: 743 with self.subTest(encoding=enc, str=s, bytes=b): 744 self.assertEqual( 745 surrogatepass_errors( 746 UnicodeEncodeError(enc, "a" + s + "b", 747 1, 1 + len(s), "ouch")), 748 (b, 1 + len(s)) 749 ) 750 self.assertEqual( 751 surrogatepass_errors( 752 UnicodeDecodeError(enc, bytearray(b"a" + b[:n] + b"b"), 753 1, 1 + n, "ouch")), 754 (s[:1], 1 + n) 755 ) 756 757 def test_badhandlerresults(self): 758 results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) ) 759 encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15") 760 761 for res in results: 762 codecs.register_error("test.badhandler", lambda x: res) 763 for enc in encs: 764 self.assertRaises( 765 TypeError, 766 "\u3042".encode, 767 enc, 768 "test.badhandler" 769 ) 770 for (enc, bytes) in ( 771 ("ascii", b"\xff"), 772 ("utf-8", b"\xff"), 773 ("utf-7", b"+x-"), 774 ): 775 self.assertRaises( 776 TypeError, 777 bytes.decode, 778 enc, 779 "test.badhandler" 780 ) 781 782 def test_lookup(self): 783 self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict")) 784 self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore")) 785 self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict")) 786 self.assertEqual( 787 codecs.xmlcharrefreplace_errors, 788 codecs.lookup_error("xmlcharrefreplace") 789 ) 790 self.assertEqual( 791 codecs.backslashreplace_errors, 792 codecs.lookup_error("backslashreplace") 793 ) 794 self.assertEqual( 795 codecs.namereplace_errors, 796 codecs.lookup_error("namereplace") 797 ) 798 799 def test_encode_nonascii_replacement(self): 800 def handle(exc): 801 if isinstance(exc, UnicodeEncodeError): 802 return (repl, exc.end) 803 raise TypeError("don't know how to handle %r" % exc) 804 codecs.register_error("test.replacing", handle) 805 806 for enc, input, repl in ( 807 ("ascii", "[¤]", "abc"), 808 ("iso-8859-1", "[€]", "½¾"), 809 ("iso-8859-15", "[¤]", "œŸ"), 810 ): 811 res = input.encode(enc, "test.replacing") 812 self.assertEqual(res, ("[" + repl + "]").encode(enc)) 813 814 for enc, input, repl in ( 815 ("utf-8", "[\udc80]", "\U0001f40d"), 816 ("utf-16", "[\udc80]", "\U0001f40d"), 817 ("utf-32", "[\udc80]", "\U0001f40d"), 818 ): 819 with self.subTest(encoding=enc): 820 with self.assertRaises(UnicodeEncodeError) as cm: 821 input.encode(enc, "test.replacing") 822 exc = cm.exception 823 self.assertEqual(exc.start, 1) 824 self.assertEqual(exc.end, 2) 825 self.assertEqual(exc.object, input) 826 827 def test_encode_unencodable_replacement(self): 828 def unencrepl(exc): 829 if isinstance(exc, UnicodeEncodeError): 830 return (repl, exc.end) 831 else: 832 raise TypeError("don't know how to handle %r" % exc) 833 codecs.register_error("test.unencreplhandler", unencrepl) 834 835 for enc, input, repl in ( 836 ("ascii", "[¤]", "½"), 837 ("iso-8859-1", "[€]", "œ"), 838 ("iso-8859-15", "[¤]", "½"), 839 ("utf-8", "[\udc80]", "\udcff"), 840 ("utf-16", "[\udc80]", "\udcff"), 841 ("utf-32", "[\udc80]", "\udcff"), 842 ): 843 with self.subTest(encoding=enc): 844 with self.assertRaises(UnicodeEncodeError) as cm: 845 input.encode(enc, "test.unencreplhandler") 846 exc = cm.exception 847 self.assertEqual(exc.start, 1) 848 self.assertEqual(exc.end, 2) 849 self.assertEqual(exc.object, input) 850 851 def test_encode_bytes_replacement(self): 852 def handle(exc): 853 if isinstance(exc, UnicodeEncodeError): 854 return (repl, exc.end) 855 raise TypeError("don't know how to handle %r" % exc) 856 codecs.register_error("test.replacing", handle) 857 858 # It works even if the bytes sequence is not decodable. 859 for enc, input, repl in ( 860 ("ascii", "[¤]", b"\xbd\xbe"), 861 ("iso-8859-1", "[€]", b"\xbd\xbe"), 862 ("iso-8859-15", "[¤]", b"\xbd\xbe"), 863 ("utf-8", "[\udc80]", b"\xbd\xbe"), 864 ("utf-16le", "[\udc80]", b"\xbd\xbe"), 865 ("utf-16be", "[\udc80]", b"\xbd\xbe"), 866 ("utf-32le", "[\udc80]", b"\xbc\xbd\xbe\xbf"), 867 ("utf-32be", "[\udc80]", b"\xbc\xbd\xbe\xbf"), 868 ): 869 with self.subTest(encoding=enc): 870 res = input.encode(enc, "test.replacing") 871 self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc)) 872 873 def test_encode_odd_bytes_replacement(self): 874 def handle(exc): 875 if isinstance(exc, UnicodeEncodeError): 876 return (repl, exc.end) 877 raise TypeError("don't know how to handle %r" % exc) 878 codecs.register_error("test.replacing", handle) 879 880 input = "[\udc80]" 881 # Tests in which the replacement bytestring contains not whole number 882 # of code units. 883 for enc, repl in ( 884 *itertools.product(("utf-16le", "utf-16be"), 885 [b"a", b"abc"]), 886 *itertools.product(("utf-32le", "utf-32be"), 887 [b"a", b"ab", b"abc", b"abcde"]), 888 ): 889 with self.subTest(encoding=enc, repl=repl): 890 with self.assertRaises(UnicodeEncodeError) as cm: 891 input.encode(enc, "test.replacing") 892 exc = cm.exception 893 self.assertEqual(exc.start, 1) 894 self.assertEqual(exc.end, 2) 895 self.assertEqual(exc.object, input) 896 self.assertEqual(exc.reason, "surrogates not allowed") 897 898 def test_badregistercall(self): 899 # enhance coverage of: 900 # Modules/_codecsmodule.c::register_error() 901 # Python/codecs.c::PyCodec_RegisterError() 902 self.assertRaises(TypeError, codecs.register_error, 42) 903 self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42) 904 905 def test_badlookupcall(self): 906 # enhance coverage of: 907 # Modules/_codecsmodule.c::lookup_error() 908 self.assertRaises(TypeError, codecs.lookup_error) 909 910 def test_unknownhandler(self): 911 # enhance coverage of: 912 # Modules/_codecsmodule.c::lookup_error() 913 self.assertRaises(LookupError, codecs.lookup_error, "test.unknown") 914 915 def test_xmlcharrefvalues(self): 916 # enhance coverage of: 917 # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors() 918 # and inline implementations 919 v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000, 920 500000, 1000000) 921 s = "".join([chr(x) for x in v]) 922 codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors) 923 for enc in ("ascii", "iso-8859-15"): 924 for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"): 925 s.encode(enc, err) 926 927 def test_decodehelper(self): 928 # enhance coverage of: 929 # Objects/unicodeobject.c::unicode_decode_call_errorhandler() 930 # and callers 931 self.assertRaises(LookupError, b"\xff".decode, "ascii", "test.unknown") 932 933 def baddecodereturn1(exc): 934 return 42 935 codecs.register_error("test.baddecodereturn1", baddecodereturn1) 936 self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn1") 937 self.assertRaises(TypeError, b"\\".decode, "unicode-escape", "test.baddecodereturn1") 938 self.assertRaises(TypeError, b"\\x0".decode, "unicode-escape", "test.baddecodereturn1") 939 self.assertRaises(TypeError, b"\\x0y".decode, "unicode-escape", "test.baddecodereturn1") 940 self.assertRaises(TypeError, b"\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1") 941 self.assertRaises(TypeError, b"\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1") 942 943 def baddecodereturn2(exc): 944 return ("?", None) 945 codecs.register_error("test.baddecodereturn2", baddecodereturn2) 946 self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn2") 947 948 handler = PosReturn() 949 codecs.register_error("test.posreturn", handler.handle) 950 951 # Valid negative position 952 handler.pos = -1 953 self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>0") 954 955 # Valid negative position 956 handler.pos = -2 957 self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?><?>") 958 959 # Negative position out of bounds 960 handler.pos = -3 961 self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn") 962 963 # Valid positive position 964 handler.pos = 1 965 self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>0") 966 967 # Largest valid positive position (one beyond end of input) 968 handler.pos = 2 969 self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>") 970 971 # Invalid positive position 972 handler.pos = 3 973 self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn") 974 975 # Restart at the "0" 976 handler.pos = 6 977 self.assertEqual(b"\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), "<?>0") 978 979 class D(dict): 980 def __getitem__(self, key): 981 raise ValueError 982 self.assertRaises(UnicodeError, codecs.charmap_decode, b"\xff", "strict", {0xff: None}) 983 self.assertRaises(ValueError, codecs.charmap_decode, b"\xff", "strict", D()) 984 self.assertRaises(TypeError, codecs.charmap_decode, b"\xff", "strict", {0xff: sys.maxunicode+1}) 985 986 def test_encodehelper(self): 987 # enhance coverage of: 988 # Objects/unicodeobject.c::unicode_encode_call_errorhandler() 989 # and callers 990 self.assertRaises(LookupError, "\xff".encode, "ascii", "test.unknown") 991 992 def badencodereturn1(exc): 993 return 42 994 codecs.register_error("test.badencodereturn1", badencodereturn1) 995 self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn1") 996 997 def badencodereturn2(exc): 998 return ("?", None) 999 codecs.register_error("test.badencodereturn2", badencodereturn2) 1000 self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn2") 1001 1002 handler = PosReturn() 1003 codecs.register_error("test.posreturn", handler.handle) 1004 1005 # Valid negative position 1006 handler.pos = -1 1007 self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>0") 1008 1009 # Valid negative position 1010 handler.pos = -2 1011 self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?><?>") 1012 1013 # Negative position out of bounds 1014 handler.pos = -3 1015 self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn") 1016 1017 # Valid positive position 1018 handler.pos = 1 1019 self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>0") 1020 1021 # Largest valid positive position (one beyond end of input 1022 handler.pos = 2 1023 self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>") 1024 1025 # Invalid positive position 1026 handler.pos = 3 1027 self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn") 1028 1029 handler.pos = 0 1030 1031 class D(dict): 1032 def __getitem__(self, key): 1033 raise ValueError 1034 for err in ("strict", "replace", "xmlcharrefreplace", 1035 "backslashreplace", "namereplace", "test.posreturn"): 1036 self.assertRaises(UnicodeError, codecs.charmap_encode, "\xff", err, {0xff: None}) 1037 self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D()) 1038 self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300}) 1039 1040 def test_decodehelper_bug36819(self): 1041 handler = RepeatedPosReturn("x") 1042 codecs.register_error("test.bug36819", handler.handle) 1043 1044 testcases = [ 1045 ("ascii", b"\xff"), 1046 ("utf-8", b"\xff"), 1047 ("utf-16be", b'\xdc\x80'), 1048 ("utf-32be", b'\x00\x00\xdc\x80'), 1049 ("iso-8859-6", b"\xff"), 1050 ] 1051 for enc, bad in testcases: 1052 input = "abcd".encode(enc) + bad 1053 with self.subTest(encoding=enc): 1054 handler.count = 50 1055 decoded = input.decode(enc, "test.bug36819") 1056 self.assertEqual(decoded, 'abcdx' * 51) 1057 1058 def test_encodehelper_bug36819(self): 1059 handler = RepeatedPosReturn() 1060 codecs.register_error("test.bug36819", handler.handle) 1061 1062 input = "abcd\udc80" 1063 encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"] # built-in 1064 encodings += ["iso-8859-15"] # charmap codec 1065 if sys.platform == 'win32': 1066 encodings = ["mbcs", "oem"] # code page codecs 1067 1068 handler.repl = "\udcff" 1069 for enc in encodings: 1070 with self.subTest(encoding=enc): 1071 handler.count = 50 1072 with self.assertRaises(UnicodeEncodeError) as cm: 1073 input.encode(enc, "test.bug36819") 1074 exc = cm.exception 1075 self.assertEqual(exc.start, 4) 1076 self.assertEqual(exc.end, 5) 1077 self.assertEqual(exc.object, input) 1078 if sys.platform == "win32": 1079 handler.count = 50 1080 with self.assertRaises(UnicodeEncodeError) as cm: 1081 codecs.code_page_encode(437, input, "test.bug36819") 1082 exc = cm.exception 1083 self.assertEqual(exc.start, 4) 1084 self.assertEqual(exc.end, 5) 1085 self.assertEqual(exc.object, input) 1086 1087 handler.repl = "x" 1088 for enc in encodings: 1089 with self.subTest(encoding=enc): 1090 # The interpreter should segfault after a handful of attempts. 1091 # 50 was chosen to try to ensure a segfault without a fix, 1092 # but not OOM a machine with one. 1093 handler.count = 50 1094 encoded = input.encode(enc, "test.bug36819") 1095 self.assertEqual(encoded.decode(enc), "abcdx" * 51) 1096 if sys.platform == "win32": 1097 handler.count = 50 1098 encoded = codecs.code_page_encode(437, input, "test.bug36819") 1099 self.assertEqual(encoded[0].decode(), "abcdx" * 51) 1100 self.assertEqual(encoded[1], len(input)) 1101 1102 def test_translatehelper(self): 1103 # enhance coverage of: 1104 # Objects/unicodeobject.c::unicode_encode_call_errorhandler() 1105 # and callers 1106 # (Unfortunately the errors argument is not directly accessible 1107 # from Python, so we can't test that much) 1108 class D(dict): 1109 def __getitem__(self, key): 1110 raise ValueError 1111 #self.assertRaises(ValueError, "\xff".translate, D()) 1112 self.assertRaises(ValueError, "\xff".translate, {0xff: sys.maxunicode+1}) 1113 self.assertRaises(TypeError, "\xff".translate, {0xff: ()}) 1114 1115 def test_bug828737(self): 1116 charmap = { 1117 ord("&"): "&", 1118 ord("<"): "<", 1119 ord(">"): ">", 1120 ord('"'): """, 1121 } 1122 1123 for n in (1, 10, 100, 1000): 1124 text = 'abc<def>ghi'*n 1125 text.translate(charmap) 1126 1127 def test_mutating_decode_handler(self): 1128 baddata = [ 1129 ("ascii", b"\xff"), 1130 ("utf-7", b"++"), 1131 ("utf-8", b"\xff"), 1132 ("utf-16", b"\xff"), 1133 ("utf-32", b"\xff"), 1134 ("unicode-escape", b"\\u123g"), 1135 ("raw-unicode-escape", b"\\u123g"), 1136 ] 1137 1138 def replacing(exc): 1139 if isinstance(exc, UnicodeDecodeError): 1140 exc.object = 42 1141 return ("\u4242", 0) 1142 else: 1143 raise TypeError("don't know how to handle %r" % exc) 1144 codecs.register_error("test.replacing", replacing) 1145 1146 for (encoding, data) in baddata: 1147 with self.assertRaises(TypeError): 1148 data.decode(encoding, "test.replacing") 1149 1150 def mutating(exc): 1151 if isinstance(exc, UnicodeDecodeError): 1152 exc.object = b"" 1153 return ("\u4242", 0) 1154 else: 1155 raise TypeError("don't know how to handle %r" % exc) 1156 codecs.register_error("test.mutating", mutating) 1157 # If the decoder doesn't pick up the modified input the following 1158 # will lead to an endless loop 1159 for (encoding, data) in baddata: 1160 self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242") 1161 1162 def test_mutating_decode_handler_unicode_escape(self): 1163 decode = codecs.unicode_escape_decode 1164 def mutating(exc): 1165 if isinstance(exc, UnicodeDecodeError): 1166 r = data.get(exc.object[:exc.end]) 1167 if r is not None: 1168 exc.object = r[0] + exc.object[exc.end:] 1169 return ('\u0404', r[1]) 1170 raise AssertionError("don't know how to handle %r" % exc) 1171 1172 codecs.register_error('test.mutating2', mutating) 1173 data = { 1174 br'\x0': (b'\\', 0), 1175 br'\x3': (b'xxx\\', 3), 1176 br'\x5': (b'x\\', 1), 1177 } 1178 def check(input, expected, msg): 1179 with self.assertWarns(DeprecationWarning) as cm: 1180 self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input))) 1181 self.assertIn(msg, str(cm.warning)) 1182 1183 check(br'\x0n\z', '\u0404\n\\z', r"invalid escape sequence '\z'") 1184 check(br'\x0n\501', '\u0404\n\u0141', r"invalid octal escape sequence '\501'") 1185 check(br'\x0z', '\u0404\\z', r"invalid escape sequence '\z'") 1186 1187 check(br'\x3n\zr', '\u0404\n\\zr', r"invalid escape sequence '\z'") 1188 check(br'\x3zr', '\u0404\\zr', r"invalid escape sequence '\z'") 1189 check(br'\x3z5', '\u0404\\z5', r"invalid escape sequence '\z'") 1190 check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r"invalid escape sequence '\z'") 1191 check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r"invalid escape sequence '\z'") 1192 1193 check(br'\x5n\z', '\u0404\n\\z', r"invalid escape sequence '\z'") 1194 check(br'\x5n\501', '\u0404\n\u0141', r"invalid octal escape sequence '\501'") 1195 check(br'\x5z', '\u0404\\z', r"invalid escape sequence '\z'") 1196 check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r"invalid escape sequence '\z'") 1197 1198 # issue32583 1199 def test_crashing_decode_handler(self): 1200 # better generating one more character to fill the extra space slot 1201 # so in debug build it can steadily fail 1202 def forward_shorter_than_end(exc): 1203 if isinstance(exc, UnicodeDecodeError): 1204 # size one character, 0 < forward < exc.end 1205 return ('\ufffd', exc.start+1) 1206 else: 1207 raise TypeError("don't know how to handle %r" % exc) 1208 codecs.register_error( 1209 "test.forward_shorter_than_end", forward_shorter_than_end) 1210 1211 self.assertEqual( 1212 b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode( 1213 'utf-16-le', 'test.forward_shorter_than_end'), 1214 '\ufffd\ufffd\ufffd\ufffd\xd8\x00' 1215 ) 1216 self.assertEqual( 1217 b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode( 1218 'utf-16-be', 'test.forward_shorter_than_end'), 1219 '\ufffd\ufffd\ufffd\ufffd\xd8\x00' 1220 ) 1221 self.assertEqual( 1222 b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode( 1223 'utf-32-le', 'test.forward_shorter_than_end'), 1224 '\ufffd\ufffd\ufffd\u1111\x00' 1225 ) 1226 self.assertEqual( 1227 b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode( 1228 'utf-32-be', 'test.forward_shorter_than_end'), 1229 '\ufffd\ufffd\ufffd\u1111\x00' 1230 ) 1231 1232 def replace_with_long(exc): 1233 if isinstance(exc, UnicodeDecodeError): 1234 exc.object = b"\x00" * 8 1235 return ('\ufffd', exc.start) 1236 else: 1237 raise TypeError("don't know how to handle %r" % exc) 1238 codecs.register_error("test.replace_with_long", replace_with_long) 1239 1240 self.assertEqual( 1241 b'\x00'.decode('utf-16', 'test.replace_with_long'), 1242 '\ufffd\x00\x00\x00\x00' 1243 ) 1244 self.assertEqual( 1245 b'\x00'.decode('utf-32', 'test.replace_with_long'), 1246 '\ufffd\x00\x00' 1247 ) 1248 1249 1250 def test_fake_error_class(self): 1251 handlers = [ 1252 codecs.strict_errors, 1253 codecs.ignore_errors, 1254 codecs.replace_errors, 1255 codecs.backslashreplace_errors, 1256 codecs.namereplace_errors, 1257 codecs.xmlcharrefreplace_errors, 1258 codecs.lookup_error('surrogateescape'), 1259 codecs.lookup_error('surrogatepass'), 1260 ] 1261 for cls in UnicodeEncodeError, UnicodeDecodeError, UnicodeTranslateError: 1262 class FakeUnicodeError(str): 1263 __class__ = cls 1264 for handler in handlers: 1265 with self.subTest(handler=handler, error_class=cls): 1266 self.assertRaises(TypeError, handler, FakeUnicodeError()) 1267 class FakeUnicodeError(Exception): 1268 __class__ = cls 1269 for handler in handlers: 1270 with self.subTest(handler=handler, error_class=cls): 1271 with self.assertRaises((TypeError, FakeUnicodeError)): 1272 handler(FakeUnicodeError()) 1273 1274 1275if __name__ == "__main__": 1276 unittest.main() 1277