1import test.test_support, unittest 2import sys, codecs, htmlentitydefs, unicodedata 3 4class PosReturn: 5 # this can be used for configurable callbacks 6 7 def __init__(self): 8 self.pos = 0 9 10 def handle(self, exc): 11 oldpos = self.pos 12 realpos = oldpos 13 if realpos<0: 14 realpos = len(exc.object) + realpos 15 # if we don't advance this time, terminate on the next call 16 # otherwise we'd get an endless loop 17 if realpos <= exc.start: 18 self.pos = len(exc.object) 19 return (u"<?>", oldpos) 20 21# A UnicodeEncodeError object with a bad start attribute 22class BadStartUnicodeEncodeError(UnicodeEncodeError): 23 def __init__(self): 24 UnicodeEncodeError.__init__(self, "ascii", u"", 0, 1, "bad") 25 self.start = [] 26 27# A UnicodeEncodeError object with a bad object attribute 28class BadObjectUnicodeEncodeError(UnicodeEncodeError): 29 def __init__(self): 30 UnicodeEncodeError.__init__(self, "ascii", u"", 0, 1, "bad") 31 self.object = [] 32 33# A UnicodeDecodeError object without an end attribute 34class NoEndUnicodeDecodeError(UnicodeDecodeError): 35 def __init__(self): 36 UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad") 37 del self.end 38 39# A UnicodeDecodeError object with a bad object attribute 40class BadObjectUnicodeDecodeError(UnicodeDecodeError): 41 def __init__(self): 42 UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad") 43 self.object = [] 44 45# A UnicodeTranslateError object without a start attribute 46class NoStartUnicodeTranslateError(UnicodeTranslateError): 47 def __init__(self): 48 UnicodeTranslateError.__init__(self, u"", 0, 1, "bad") 49 del self.start 50 51# A UnicodeTranslateError object without an end attribute 52class NoEndUnicodeTranslateError(UnicodeTranslateError): 53 def __init__(self): 54 UnicodeTranslateError.__init__(self, u"", 0, 1, "bad") 55 del self.end 56 57# A UnicodeTranslateError object without an object attribute 58class NoObjectUnicodeTranslateError(UnicodeTranslateError): 59 def __init__(self): 60 UnicodeTranslateError.__init__(self, u"", 0, 1, "bad") 61 del self.object 62 63class CodecCallbackTest(unittest.TestCase): 64 65 def test_xmlcharrefreplace(self): 66 # replace unencodable characters which numeric character entities. 67 # For ascii, latin-1 and charmaps this is completely implemented 68 # in C and should be reasonably fast. 69 s = u"\u30b9\u30d1\u30e2 \xe4nd egg\u0161" 70 self.assertEqual( 71 s.encode("ascii", "xmlcharrefreplace"), 72 "スパモ änd eggš" 73 ) 74 self.assertEqual( 75 s.encode("latin-1", "xmlcharrefreplace"), 76 "スパモ \xe4nd eggš" 77 ) 78 self.assertEqual( 79 s.encode("iso-8859-15", "xmlcharrefreplace"), 80 "スパモ \xe4nd egg\xa8" 81 ) 82 83 def test_xmlcharrefreplace_with_surrogates(self): 84 tests = [(u'\U0001f49d', '💝'), 85 (u'\ud83d', '�'), 86 (u'\udc9d', '�'), 87 ] 88 if u'\ud83d\udc9d' != u'\U0001f49d': 89 tests += [(u'\ud83d\udc9d', '��')] 90 for encoding in ['ascii', 'latin1', 'iso-8859-15']: 91 for s, exp in tests: 92 self.assertEqual(s.encode(encoding, 'xmlcharrefreplace'), 93 exp, msg='%r.encode(%r)' % (s, encoding)) 94 self.assertEqual((s+'X').encode(encoding, 'xmlcharrefreplace'), 95 exp+'X', 96 msg='%r.encode(%r)' % (s + 'X', encoding)) 97 98 def test_xmlcharnamereplace(self): 99 # This time use a named character entity for unencodable 100 # characters, if one is available. 101 102 def xmlcharnamereplace(exc): 103 if not isinstance(exc, UnicodeEncodeError): 104 raise TypeError("don't know how to handle %r" % exc) 105 l = [] 106 for c in exc.object[exc.start:exc.end]: 107 try: 108 l.append(u"&%s;" % htmlentitydefs.codepoint2name[ord(c)]) 109 except KeyError: 110 l.append(u"&#%d;" % ord(c)) 111 return (u"".join(l), exc.end) 112 113 codecs.register_error( 114 "test.xmlcharnamereplace", xmlcharnamereplace) 115 116 sin = u"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a" 117 sout = "«ℜ» = ⟨ሴ€⟩" 118 self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout) 119 sout = "\xabℜ\xbb = ⟨ሴ€⟩" 120 self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout) 121 sout = "\xabℜ\xbb = ⟨ሴ\xa4⟩" 122 self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout) 123 124 def test_uninamereplace(self): 125 # We're using the names from the unicode database this time, 126 # and we're doing "syntax highlighting" here, i.e. we include 127 # the replaced text in ANSI escape sequences. For this it is 128 # useful that the error handler is not called for every single 129 # unencodable character, but for a complete sequence of 130 # unencodable characters, otherwise we would output many 131 # unnecessary escape sequences. 132 133 def uninamereplace(exc): 134 if not isinstance(exc, UnicodeEncodeError): 135 raise TypeError("don't know how to handle %r" % exc) 136 l = [] 137 for c in exc.object[exc.start:exc.end]: 138 l.append(unicodedata.name(c, u"0x%x" % ord(c))) 139 return (u"\033[1m%s\033[0m" % u", ".join(l), exc.end) 140 141 codecs.register_error( 142 "test.uninamereplace", uninamereplace) 143 144 sin = u"\xac\u1234\u20ac\u8000" 145 sout = "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m" 146 self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout) 147 148 sout = "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m" 149 self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout) 150 151 sout = "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m" 152 self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout) 153 154 def test_backslashescape(self): 155 # Does the same as the "unicode-escape" encoding, but with different 156 # base encodings. 157 sin = u"a\xac\u1234\u20ac\u8000" 158 if sys.maxunicode > 0xffff: 159 sin += unichr(sys.maxunicode) 160 sout = "a\\xac\\u1234\\u20ac\\u8000" 161 if sys.maxunicode > 0xffff: 162 sout += "\\U%08x" % sys.maxunicode 163 self.assertEqual(sin.encode("ascii", "backslashreplace"), sout) 164 165 sout = "a\xac\\u1234\\u20ac\\u8000" 166 if sys.maxunicode > 0xffff: 167 sout += "\\U%08x" % sys.maxunicode 168 self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout) 169 170 sout = "a\xac\\u1234\xa4\\u8000" 171 if sys.maxunicode > 0xffff: 172 sout += "\\U%08x" % sys.maxunicode 173 self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout) 174 175 def test_decoding_callbacks(self): 176 # This is a test for a decoding callback handler 177 # that allows the decoding of the invalid sequence 178 # "\xc0\x80" and returns "\x00" instead of raising an error. 179 # All other illegal sequences will be handled strictly. 180 def relaxedutf8(exc): 181 if not isinstance(exc, UnicodeDecodeError): 182 raise TypeError("don't know how to handle %r" % exc) 183 if exc.object[exc.start:exc.start+2] == "\xc0\x80": 184 return (u"\x00", exc.start+2) # retry after two bytes 185 else: 186 raise exc 187 188 codecs.register_error("test.relaxedutf8", relaxedutf8) 189 190 # all the "\xc0\x80" will be decoded to "\x00" 191 sin = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80" 192 sout = u"a\x00b\x00c\xfc\x00\x00" 193 self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout) 194 195 # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised 196 sin = "\xc0\x80\xc0\x81" 197 self.assertRaises(UnicodeDecodeError, sin.decode, 198 "utf-8", "test.relaxedutf8") 199 200 def test_charmapencode(self): 201 # For charmap encodings the replacement string will be 202 # mapped through the encoding again. This means, that 203 # to be able to use e.g. the "replace" handler, the 204 # charmap has to have a mapping for "?". 205 charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"]) 206 sin = u"abc" 207 sout = "AABBCC" 208 self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout) 209 210 sin = u"abcA" 211 self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap) 212 213 charmap[ord("?")] = "XYZ" 214 sin = u"abcDEF" 215 sout = "AABBCCXYZXYZXYZ" 216 self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout) 217 218 charmap[ord("?")] = u"XYZ" 219 self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap) 220 221 charmap[ord("?")] = u"XYZ" 222 self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap) 223 224 def test_decodeunicodeinternal(self): 225 self.assertRaises( 226 UnicodeDecodeError, 227 "\x00\x00\x00\x00\x00".decode, 228 "unicode-internal", 229 ) 230 if sys.maxunicode > 0xffff: 231 def handler_unicodeinternal(exc): 232 if not isinstance(exc, UnicodeDecodeError): 233 raise TypeError("don't know how to handle %r" % exc) 234 return (u"\x01", 1) 235 236 self.assertEqual( 237 "\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"), 238 u"\u0000" 239 ) 240 241 self.assertEqual( 242 "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"), 243 u"\u0000\ufffd" 244 ) 245 246 codecs.register_error("test.hui", handler_unicodeinternal) 247 248 self.assertEqual( 249 "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"), 250 u"\u0000\u0001\u0000" 251 ) 252 253 def test_callbacks(self): 254 def handler1(exc): 255 if not isinstance(exc, UnicodeEncodeError) \ 256 and not isinstance(exc, UnicodeDecodeError): 257 raise TypeError("don't know how to handle %r" % exc) 258 l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)] 259 return (u"[%s]" % u"".join(l), exc.end) 260 261 codecs.register_error("test.handler1", handler1) 262 263 def handler2(exc): 264 if not isinstance(exc, UnicodeDecodeError): 265 raise TypeError("don't know how to handle %r" % exc) 266 l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)] 267 return (u"[%s]" % u"".join(l), exc.end+1) # skip one character 268 269 codecs.register_error("test.handler2", handler2) 270 271 s = "\x00\x81\x7f\x80\xff" 272 273 self.assertEqual( 274 s.decode("ascii", "test.handler1"), 275 u"\x00[<129>]\x7f[<128>][<255>]" 276 ) 277 self.assertEqual( 278 s.decode("ascii", "test.handler2"), 279 u"\x00[<129>][<128>]" 280 ) 281 282 self.assertEqual( 283 "\\u3042\u3xxx".decode("unicode-escape", "test.handler1"), 284 u"\u3042[<92><117><51>]xxx" 285 ) 286 287 self.assertEqual( 288 "\\u3042\u3xx".decode("unicode-escape", "test.handler1"), 289 u"\u3042[<92><117><51>]xx" 290 ) 291 292 self.assertEqual( 293 codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0], 294 u"z[<98>][<99>]" 295 ) 296 297 self.assertEqual( 298 u"g\xfc\xdfrk".encode("ascii", "test.handler1"), 299 u"g[<252><223>]rk" 300 ) 301 302 self.assertEqual( 303 u"g\xfc\xdf".encode("ascii", "test.handler1"), 304 u"g[<252><223>]" 305 ) 306 307 def test_longstrings(self): 308 # test long strings to check for memory overflow problems 309 errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", 310 "backslashreplace"] 311 # register the handlers under different names, 312 # to prevent the codec from recognizing the name 313 for err in errors: 314 codecs.register_error("test." + err, codecs.lookup_error(err)) 315 l = 1000 316 errors += [ "test." + err for err in errors ] 317 for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]: 318 for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", 319 "utf-8", "utf-7", "utf-16", "utf-32"): 320 for err in errors: 321 try: 322 uni.encode(enc, err) 323 except UnicodeError: 324 pass 325 326 def check_exceptionobjectargs(self, exctype, args, msg): 327 # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion 328 # check with one missing argument 329 self.assertRaises(TypeError, exctype, *args[:-1]) 330 # check with one argument too much 331 self.assertRaises(TypeError, exctype, *(args + ["too much"])) 332 # check with one argument of the wrong type 333 wrongargs = [ "spam", u"eggs", 42, 1.0, None ] 334 for i in xrange(len(args)): 335 for wrongarg in wrongargs: 336 if type(wrongarg) is type(args[i]): 337 continue 338 # build argument array 339 callargs = [] 340 for j in xrange(len(args)): 341 if i==j: 342 callargs.append(wrongarg) 343 else: 344 callargs.append(args[i]) 345 self.assertRaises(TypeError, exctype, *callargs) 346 347 # check with the correct number and type of arguments 348 exc = exctype(*args) 349 self.assertEqual(str(exc), msg) 350 351 def test_unicodeencodeerror(self): 352 self.check_exceptionobjectargs( 353 UnicodeEncodeError, 354 ["ascii", u"g\xfcrk", 1, 2, "ouch"], 355 "'ascii' codec can't encode character u'\\xfc' in position 1: ouch" 356 ) 357 self.check_exceptionobjectargs( 358 UnicodeEncodeError, 359 ["ascii", u"g\xfcrk", 1, 4, "ouch"], 360 "'ascii' codec can't encode characters in position 1-3: ouch" 361 ) 362 self.check_exceptionobjectargs( 363 UnicodeEncodeError, 364 ["ascii", u"\xfcx", 0, 1, "ouch"], 365 "'ascii' codec can't encode character u'\\xfc' in position 0: ouch" 366 ) 367 self.check_exceptionobjectargs( 368 UnicodeEncodeError, 369 ["ascii", u"\u0100x", 0, 1, "ouch"], 370 "'ascii' codec can't encode character u'\\u0100' in position 0: ouch" 371 ) 372 self.check_exceptionobjectargs( 373 UnicodeEncodeError, 374 ["ascii", u"\uffffx", 0, 1, "ouch"], 375 "'ascii' codec can't encode character u'\\uffff' in position 0: ouch" 376 ) 377 if sys.maxunicode > 0xffff: 378 self.check_exceptionobjectargs( 379 UnicodeEncodeError, 380 ["ascii", u"\U00010000x", 0, 1, "ouch"], 381 "'ascii' codec can't encode character u'\\U00010000' in position 0: ouch" 382 ) 383 384 def test_unicodedecodeerror(self): 385 self.check_exceptionobjectargs( 386 UnicodeDecodeError, 387 ["ascii", "g\xfcrk", 1, 2, "ouch"], 388 "'ascii' codec can't decode byte 0xfc in position 1: ouch" 389 ) 390 self.check_exceptionobjectargs( 391 UnicodeDecodeError, 392 ["ascii", "g\xfcrk", 1, 3, "ouch"], 393 "'ascii' codec can't decode bytes in position 1-2: ouch" 394 ) 395 396 def test_unicodetranslateerror(self): 397 self.check_exceptionobjectargs( 398 UnicodeTranslateError, 399 [u"g\xfcrk", 1, 2, "ouch"], 400 "can't translate character u'\\xfc' in position 1: ouch" 401 ) 402 self.check_exceptionobjectargs( 403 UnicodeTranslateError, 404 [u"g\u0100rk", 1, 2, "ouch"], 405 "can't translate character u'\\u0100' in position 1: ouch" 406 ) 407 self.check_exceptionobjectargs( 408 UnicodeTranslateError, 409 [u"g\uffffrk", 1, 2, "ouch"], 410 "can't translate character u'\\uffff' in position 1: ouch" 411 ) 412 if sys.maxunicode > 0xffff: 413 self.check_exceptionobjectargs( 414 UnicodeTranslateError, 415 [u"g\U00010000rk", 1, 2, "ouch"], 416 "can't translate character u'\\U00010000' in position 1: ouch" 417 ) 418 self.check_exceptionobjectargs( 419 UnicodeTranslateError, 420 [u"g\xfcrk", 1, 3, "ouch"], 421 "can't translate characters in position 1-2: ouch" 422 ) 423 424 def test_badandgoodstrictexceptions(self): 425 # "strict" complains about a non-exception passed in 426 self.assertRaises( 427 TypeError, 428 codecs.strict_errors, 429 42 430 ) 431 # "strict" complains about the wrong exception type 432 self.assertRaises( 433 Exception, 434 codecs.strict_errors, 435 Exception("ouch") 436 ) 437 438 # If the correct exception is passed in, "strict" raises it 439 self.assertRaises( 440 UnicodeEncodeError, 441 codecs.strict_errors, 442 UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch") 443 ) 444 self.assertRaises( 445 UnicodeDecodeError, 446 codecs.strict_errors, 447 UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch") 448 ) 449 self.assertRaises( 450 UnicodeTranslateError, 451 codecs.strict_errors, 452 UnicodeTranslateError(u"\u3042", 0, 1, "ouch") 453 ) 454 455 def test_badandgoodignoreexceptions(self): 456 # "ignore" complains about a non-exception passed in 457 self.assertRaises( 458 TypeError, 459 codecs.ignore_errors, 460 42 461 ) 462 # "ignore" complains about the wrong exception type 463 self.assertRaises( 464 TypeError, 465 codecs.ignore_errors, 466 UnicodeError("ouch") 467 ) 468 # If the correct exception is passed in, "ignore" returns an empty replacement 469 self.assertEqual( 470 codecs.ignore_errors( 471 UnicodeEncodeError("ascii", u"a\u3042b", 1, 2, "ouch")), 472 (u"", 2) 473 ) 474 self.assertEqual( 475 codecs.ignore_errors( 476 UnicodeDecodeError("ascii", "a\xffb", 1, 2, "ouch")), 477 (u"", 2) 478 ) 479 self.assertEqual( 480 codecs.ignore_errors( 481 UnicodeTranslateError(u"a\u3042b", 1, 2, "ouch")), 482 (u"", 2) 483 ) 484 485 def test_badandgoodreplaceexceptions(self): 486 # "replace" complains about a non-exception passed in 487 self.assertRaises( 488 TypeError, 489 codecs.replace_errors, 490 42 491 ) 492 # "replace" complains about the wrong exception type 493 self.assertRaises( 494 TypeError, 495 codecs.replace_errors, 496 UnicodeError("ouch") 497 ) 498 self.assertRaises( 499 TypeError, 500 codecs.replace_errors, 501 BadObjectUnicodeEncodeError() 502 ) 503 self.assertRaises( 504 TypeError, 505 codecs.replace_errors, 506 BadObjectUnicodeDecodeError() 507 ) 508 # With the correct exception, "replace" returns an "?" or u"\ufffd" replacement 509 self.assertEqual( 510 codecs.replace_errors( 511 UnicodeEncodeError("ascii", u"a\u3042b", 1, 2, "ouch")), 512 (u"?", 2) 513 ) 514 self.assertEqual( 515 codecs.replace_errors( 516 UnicodeDecodeError("ascii", "a\xffb", 1, 2, "ouch")), 517 (u"\ufffd", 2) 518 ) 519 self.assertEqual( 520 codecs.replace_errors( 521 UnicodeTranslateError(u"a\u3042b", 1, 2, "ouch")), 522 (u"\ufffd", 2) 523 ) 524 525 def test_badandgoodxmlcharrefreplaceexceptions(self): 526 # "xmlcharrefreplace" complains about a non-exception passed in 527 self.assertRaises( 528 TypeError, 529 codecs.xmlcharrefreplace_errors, 530 42 531 ) 532 # "xmlcharrefreplace" complains about the wrong exception types 533 self.assertRaises( 534 TypeError, 535 codecs.xmlcharrefreplace_errors, 536 UnicodeError("ouch") 537 ) 538 # "xmlcharrefreplace" can only be used for encoding 539 self.assertRaises( 540 TypeError, 541 codecs.xmlcharrefreplace_errors, 542 UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch") 543 ) 544 self.assertRaises( 545 TypeError, 546 codecs.xmlcharrefreplace_errors, 547 UnicodeTranslateError(u"\u3042", 0, 1, "ouch") 548 ) 549 # Use the correct exception 550 cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000) 551 cs += (0xdfff, 0xd800) 552 s = u"".join(unichr(c) for c in cs) 553 s += u"\U0001869f\U000186a0\U000f423f\U000f4240" 554 cs += (99999, 100000, 999999, 1000000) 555 self.assertEqual( 556 codecs.xmlcharrefreplace_errors( 557 UnicodeEncodeError("ascii", u"a" + s + u"b", 558 1, 1 + len(s), "ouch") 559 ), 560 (u"".join(u"&#%d;" % c for c in cs), 1 + len(s)) 561 ) 562 563 def test_badandgoodbackslashreplaceexceptions(self): 564 # "backslashreplace" complains about a non-exception passed in 565 self.assertRaises( 566 TypeError, 567 codecs.backslashreplace_errors, 568 42 569 ) 570 # "backslashreplace" complains about the wrong exception types 571 self.assertRaises( 572 TypeError, 573 codecs.backslashreplace_errors, 574 UnicodeError("ouch") 575 ) 576 # "backslashreplace" can only be used for encoding 577 self.assertRaises( 578 TypeError, 579 codecs.backslashreplace_errors, 580 UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch") 581 ) 582 self.assertRaises( 583 TypeError, 584 codecs.backslashreplace_errors, 585 UnicodeTranslateError(u"\u3042", 0, 1, "ouch") 586 ) 587 # Use the correct exception 588 tests = [ 589 (u"\u3042", u"\\u3042"), 590 (u"\n", u"\\x0a"), 591 (u"a", u"\\x61"), 592 (u"\x00", u"\\x00"), 593 (u"\xff", u"\\xff"), 594 (u"\u0100", u"\\u0100"), 595 (u"\uffff", u"\\uffff"), 596 # Lone surrogates 597 (u"\ud800", u"\\ud800"), 598 (u"\udfff", u"\\udfff"), 599 ] 600 if sys.maxunicode > 0xffff: 601 tests += [ 602 (u"\U00010000", u"\\U00010000"), 603 (u"\U0010ffff", u"\\U0010ffff"), 604 ] 605 else: 606 tests += [ 607 (u"\U00010000", u"\\ud800\\udc00"), 608 (u"\U0010ffff", u"\\udbff\\udfff"), 609 ] 610 for s, r in tests: 611 self.assertEqual( 612 codecs.backslashreplace_errors( 613 UnicodeEncodeError("ascii", u"a" + s + u"b", 614 1, 1 + len(s), "ouch")), 615 (r, 1 + len(s)) 616 ) 617 618 def test_badhandlerresults(self): 619 results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) ) 620 encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15") 621 622 for res in results: 623 codecs.register_error("test.badhandler", lambda x: res) 624 for enc in encs: 625 self.assertRaises( 626 TypeError, 627 u"\u3042".encode, 628 enc, 629 "test.badhandler" 630 ) 631 for (enc, bytes) in ( 632 ("ascii", "\xff"), 633 ("utf-8", "\xff"), 634 ("utf-7", "+x-"), 635 ("unicode-internal", "\x00"), 636 ): 637 self.assertRaises( 638 TypeError, 639 bytes.decode, 640 enc, 641 "test.badhandler" 642 ) 643 644 def test_lookup(self): 645 self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict")) 646 self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore")) 647 self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict")) 648 self.assertEqual( 649 codecs.xmlcharrefreplace_errors, 650 codecs.lookup_error("xmlcharrefreplace") 651 ) 652 self.assertEqual( 653 codecs.backslashreplace_errors, 654 codecs.lookup_error("backslashreplace") 655 ) 656 657 def test_unencodablereplacement(self): 658 def unencrepl(exc): 659 if isinstance(exc, UnicodeEncodeError): 660 return (u"\u4242", exc.end) 661 else: 662 raise TypeError("don't know how to handle %r" % exc) 663 codecs.register_error("test.unencreplhandler", unencrepl) 664 for enc in ("ascii", "iso-8859-1", "iso-8859-15"): 665 self.assertRaises( 666 UnicodeEncodeError, 667 u"\u4242".encode, 668 enc, 669 "test.unencreplhandler" 670 ) 671 672 def test_badregistercall(self): 673 # enhance coverage of: 674 # Modules/_codecsmodule.c::register_error() 675 # Python/codecs.c::PyCodec_RegisterError() 676 self.assertRaises(TypeError, codecs.register_error, 42) 677 self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42) 678 679 def test_badlookupcall(self): 680 # enhance coverage of: 681 # Modules/_codecsmodule.c::lookup_error() 682 self.assertRaises(TypeError, codecs.lookup_error) 683 684 def test_unknownhandler(self): 685 # enhance coverage of: 686 # Modules/_codecsmodule.c::lookup_error() 687 self.assertRaises(LookupError, codecs.lookup_error, "test.unknown") 688 689 def test_xmlcharrefvalues(self): 690 # enhance coverage of: 691 # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors() 692 # and inline implementations 693 v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000) 694 if sys.maxunicode>=100000: 695 v += (100000, 500000, 1000000) 696 s = u"".join([unichr(x) for x in v]) 697 codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors) 698 for enc in ("ascii", "iso-8859-15"): 699 for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"): 700 s.encode(enc, err) 701 702 def test_decodehelper(self): 703 # enhance coverage of: 704 # Objects/unicodeobject.c::unicode_decode_call_errorhandler() 705 # and callers 706 self.assertRaises(LookupError, "\xff".decode, "ascii", "test.unknown") 707 708 def baddecodereturn1(exc): 709 return 42 710 codecs.register_error("test.baddecodereturn1", baddecodereturn1) 711 self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn1") 712 self.assertRaises(TypeError, "\\".decode, "unicode-escape", "test.baddecodereturn1") 713 self.assertRaises(TypeError, "\\x0".decode, "unicode-escape", "test.baddecodereturn1") 714 self.assertRaises(TypeError, "\\x0y".decode, "unicode-escape", "test.baddecodereturn1") 715 self.assertRaises(TypeError, "\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1") 716 self.assertRaises(TypeError, "\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1") 717 718 def baddecodereturn2(exc): 719 return (u"?", None) 720 codecs.register_error("test.baddecodereturn2", baddecodereturn2) 721 self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn2") 722 723 handler = PosReturn() 724 codecs.register_error("test.posreturn", handler.handle) 725 726 # Valid negative position 727 handler.pos = -1 728 self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?>0") 729 730 # Valid negative position 731 handler.pos = -2 732 self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?><?>") 733 734 # Negative position out of bounds 735 handler.pos = -3 736 self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn") 737 738 # Valid positive position 739 handler.pos = 1 740 self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?>0") 741 742 # Largest valid positive position (one beyond end of input) 743 handler.pos = 2 744 self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?>") 745 746 # Invalid positive position 747 handler.pos = 3 748 self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn") 749 750 # Restart at the "0" 751 handler.pos = 6 752 self.assertEqual("\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), u"<?>0") 753 754 class D(dict): 755 def __getitem__(self, key): 756 raise ValueError 757 self.assertRaises(UnicodeError, codecs.charmap_decode, "\xff", "strict", {0xff: None}) 758 self.assertRaises(ValueError, codecs.charmap_decode, "\xff", "strict", D()) 759 self.assertRaises(TypeError, codecs.charmap_decode, "\xff", "strict", {0xff: 0x110000}) 760 761 def test_encodehelper(self): 762 # enhance coverage of: 763 # Objects/unicodeobject.c::unicode_encode_call_errorhandler() 764 # and callers 765 self.assertRaises(LookupError, u"\xff".encode, "ascii", "test.unknown") 766 767 def badencodereturn1(exc): 768 return 42 769 codecs.register_error("test.badencodereturn1", badencodereturn1) 770 self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn1") 771 772 def badencodereturn2(exc): 773 return (u"?", None) 774 codecs.register_error("test.badencodereturn2", badencodereturn2) 775 self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn2") 776 777 handler = PosReturn() 778 codecs.register_error("test.posreturn", handler.handle) 779 780 # Valid negative position 781 handler.pos = -1 782 self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?>0") 783 784 # Valid negative position 785 handler.pos = -2 786 self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?><?>") 787 788 # Negative position out of bounds 789 handler.pos = -3 790 self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn") 791 792 # Valid positive position 793 handler.pos = 1 794 self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?>0") 795 796 # Largest valid positive position (one beyond end of input 797 handler.pos = 2 798 self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?>") 799 800 # Invalid positive position 801 handler.pos = 3 802 self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn") 803 804 handler.pos = 0 805 806 class D(dict): 807 def __getitem__(self, key): 808 raise ValueError 809 for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"): 810 self.assertRaises(UnicodeError, codecs.charmap_encode, u"\xff", err, {0xff: None}) 811 self.assertRaises(ValueError, codecs.charmap_encode, u"\xff", err, D()) 812 self.assertRaises(TypeError, codecs.charmap_encode, u"\xff", err, {0xff: 300}) 813 814 def test_translatehelper(self): 815 # enhance coverage of: 816 # Objects/unicodeobject.c::unicode_encode_call_errorhandler() 817 # and callers 818 # (Unfortunately the errors argument is not directly accessible 819 # from Python, so we can't test that much) 820 class D(dict): 821 def __getitem__(self, key): 822 raise ValueError 823 self.assertRaises(ValueError, u"\xff".translate, D()) 824 self.assertRaises(TypeError, u"\xff".translate, {0xff: sys.maxunicode+1}) 825 self.assertRaises(TypeError, u"\xff".translate, {0xff: ()}) 826 827 def test_bug828737(self): 828 charmap = { 829 ord("&"): u"&", 830 ord("<"): u"<", 831 ord(">"): u">", 832 ord('"'): u""", 833 } 834 835 for n in (1, 10, 100, 1000): 836 text = u'abc<def>ghi'*n 837 text.translate(charmap) 838 839 def test_fake_error_class(self): 840 handlers = [ 841 codecs.strict_errors, 842 codecs.ignore_errors, 843 codecs.replace_errors, 844 codecs.backslashreplace_errors, 845 codecs.xmlcharrefreplace_errors, 846 ] 847 for cls in UnicodeEncodeError, UnicodeDecodeError, UnicodeTranslateError: 848 class FakeUnicodeError(str): 849 __class__ = cls 850 for handler in handlers: 851 self.assertRaises(TypeError, handler, FakeUnicodeError()) 852 class FakeUnicodeError(Exception): 853 __class__ = cls 854 for handler in handlers: 855 with self.assertRaises((TypeError, FakeUnicodeError)): 856 handler(FakeUnicodeError()) 857 858 859def test_main(): 860 test.test_support.run_unittest(CodecCallbackTest) 861 862if __name__ == "__main__": 863 test_main() 864