""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""

import builtins
import sys

### Registry and builtin stateless codec functions

try:
    from _codecs import *
except ImportError as why:
    # Chain explicitly so the original import failure stays attached
    # as __cause__ of the SystemError.
    raise SystemError('Failed to load the builtin codecs: %s' % why) from why

__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "backslashreplace_errors", "namereplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE


### Codec base classes (defining the API)

class CodecInfo(tuple):
    """Codec details when looking up the codec registry"""

    # Private API to allow Python 3.4 to denylist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None,
        *, _is_text_encoding=None):
        # The tuple part only carries the four "classic" entries; every
        # component is additionally exposed as an instance attribute.
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        if _is_text_encoding is not None:
            # Override the class-level default only when explicitly given
            self._is_text_encoding = _is_text_encoding
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at %#x>" % \
                (self.__class__.__module__, self.__class__.__qualname__,
                 self.name, id(self))

    def __getnewargs__(self):
        # Returns the four tuple entries (encode, decode, streamreader,
        # streamwriter) as positional arguments for __new__.
        return tuple(self)

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences.
         'namereplace'       - Replace with \\N{...} escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        """
        Encode the buffered leftovers plus input and return the output of
        _buffer_encode(); whatever it did not consume stays buffered.
        """
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer is reported as 0
        return self.buffer or 0

    def setstate(self, state):
        # Any falsy state (such as the 0 produced by getstate()) restores
        # the empty buffer
        self.buffer = state or ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        """
        Decode the buffered leftovers plus input and return the output of
        _buffer_decode(); whatever it did not consume stays buffered.
        """
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]

#
# The StreamWriter and StreamReader class provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences.
             'namereplace'       - Replace with \\N{...} escape sequences.

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Resets the codec buffers used for keeping internal state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def seek(self, offset, whence=0):
        """ Seek the underlying stream; the writer is reset only when
            seeking to the very start (offset 0, whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        # __getattr__ only runs when normal lookup failed.  If "stream"
        # itself is missing (instance not initialised yet), delegating
        # would re-enter this method without end.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

    def __reduce_ex__(self, proto):
        raise TypeError("can't serialize %s" % self.__class__.__name__)

###

class StreamReader(Codec):

    # Type used to create the empty character buffer (see __init__)
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'backslashreplace' - Replace with backslashed escape sequences;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # raw data read from the stream but not decoded yet
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # decoded data not handed out to the caller yet
        self.charbuffer = self._empty_charbuffer
        # list of complete lines cached by readline(), or None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.

            size indicates the approximate maximum number of decoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        if chars < 0:
            # For compatibility with other read() methods that take a
            # single argument
            chars = size

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Retry with only the data in front of the error; this
                    # is acceptable only if it yields more than one line.
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping internal state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        # Avoid unbounded recursion when "stream" itself is not set yet
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

    def __reduce_ex__(self, proto):
        raise TypeError("can't serialize %s" % self.__class__.__name__)

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Seek the underlying stream.

            The reader is always reset; the writer is reset only when
            seeking to the very start (offset 0, whence 0).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        # Avoid unbounded recursion when "stream" itself is not set yet
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)

    # these are needed to make "with StreamReaderWriter(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

    def __reduce_ex__(self, proto):
        raise TypeError("can't serialize %s" % self.__class__.__name__)

###

class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        # sizehint is not used: everything is read, re-encoded and split
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        data = b''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        # Seeks must be propagated to both the readers and writers
        # as they might need to reset their internal buffers.
        self.reader.seek(offset, whence)
        self.writer.seek(offset, whence)

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        # Avoid unbounded recursion when "stream" itself is not set yet
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

    def __reduce_ex__(self, proto):
        raise TypeError("can't serialize %s" % self.__class__.__name__)

### Shortcuts

def open(filename, mode='r', encoding=None, errors='strict', buffering=-1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If encoding is not None, then the
        underlying encoded files are always opened in binary mode.
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to -1 which means that the default buffer size will
        be used.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Do not leak the already opened file if wrapping it failed;
        # the original exception is re-raised unchanged.
        file.close()
        raise

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is decoded according
        to the given data_encoding and then encoded to the underlying
        file using file_encoding. The intermediate data type
        will usually be Unicode but depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and then
        passed back to the caller encoded using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codecs doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        raise LookupError(encoding)
    return encoder

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codecs doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        raise LookupError(encoding)
    return decoder

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    # final call flushes whatever the encoder still holds
    output = encoder.encode("", True)
    if output:
        yield output

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    # final call flushes whatever the decoder still holds
    output = decoder.decode(b"", True)
    if output:
        yield output

### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    return {i:i for i in rng}

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    m = {}
    for k,v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            # ambiguous reverse mapping
            m[v] = None
    return m

### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
    namereplace_errors = lookup_error("namereplace")
except LookupError:
    # In --disable-unicode builds, these error handler are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
    namereplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings