1""" codecs -- Python Codec Registry, API and helpers. 2 3 4Written by Marc-Andre Lemburg (mal@lemburg.com). 5 6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 7 8""" 9 10import builtins 11import sys 12 13### Registry and builtin stateless codec functions 14 15try: 16 from _codecs import * 17except ImportError as why: 18 raise SystemError('Failed to load the builtin codecs: %s' % why) 19 20__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", 21 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", 22 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE", 23 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE", 24 "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder", 25 "StreamReader", "StreamWriter", 26 "StreamReaderWriter", "StreamRecoder", 27 "getencoder", "getdecoder", "getincrementalencoder", 28 "getincrementaldecoder", "getreader", "getwriter", 29 "encode", "decode", "iterencode", "iterdecode", 30 "strict_errors", "ignore_errors", "replace_errors", 31 "xmlcharrefreplace_errors", 32 "backslashreplace_errors", "namereplace_errors", 33 "register_error", "lookup_error"] 34 35### Constants 36 37# 38# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) 39# and its possible byte string values 40# for UTF8/UTF16/UTF32 output and little/big endian machines 41# 42 43# UTF-8 44BOM_UTF8 = b'\xef\xbb\xbf' 45 46# UTF-16, little endian 47BOM_LE = BOM_UTF16_LE = b'\xff\xfe' 48 49# UTF-16, big endian 50BOM_BE = BOM_UTF16_BE = b'\xfe\xff' 51 52# UTF-32, little endian 53BOM_UTF32_LE = b'\xff\xfe\x00\x00' 54 55# UTF-32, big endian 56BOM_UTF32_BE = b'\x00\x00\xfe\xff' 57 58if sys.byteorder == 'little': 59 60 # UTF-16, native endianness 61 BOM = BOM_UTF16 = BOM_UTF16_LE 62 63 # UTF-32, native endianness 64 BOM_UTF32 = BOM_UTF32_LE 65 66else: 67 68 # UTF-16, native endianness 69 BOM = BOM_UTF16 = BOM_UTF16_BE 70 71 # UTF-32, native endianness 72 BOM_UTF32 = BOM_UTF32_BE 73 74# Old broken names (don't use in new code) 75BOM32_LE = BOM_UTF16_LE 76BOM32_BE = BOM_UTF16_BE 77BOM64_LE = BOM_UTF32_LE 78BOM64_BE = BOM_UTF32_BE 79 80 81### Codec base classes (defining the API) 82 83class CodecInfo(tuple): 84 """Codec details when looking up the codec registry""" 85 86 # Private API to allow Python 3.4 to blacklist the known non-Unicode 87 # codecs in the standard library. A more general mechanism to 88 # reliably distinguish test encodings from other codecs will hopefully 89 # be defined for Python 3.5 90 # 91 # See http://bugs.python.org/issue19619 92 _is_text_encoding = True # Assume codecs are text encodings by default 93 94 def __new__(cls, encode, decode, streamreader=None, streamwriter=None, 95 incrementalencoder=None, incrementaldecoder=None, name=None, 96 *, _is_text_encoding=None): 97 self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) 98 self.name = name 99 self.encode = encode 100 self.decode = decode 101 self.incrementalencoder = incrementalencoder 102 self.incrementaldecoder = incrementaldecoder 103 self.streamwriter = streamwriter 104 self.streamreader = streamreader 105 if _is_text_encoding is not None: 106 self._is_text_encoding = _is_text_encoding 107 return self 108 109 def __repr__(self): 110 return "<%s.%s object for encoding %s at %#x>" % \ 111 (self.__class__.__module__, self.__class__.__qualname__, 112 self.name, id(self)) 113 114class Codec: 115 116 """ Defines the interface for stateless encoders/decoders. 117 118 The .encode()/.decode() methods may use different error 119 handling schemes by providing the errors argument. These 120 string values are predefined: 121 122 'strict' - raise a ValueError error (or a subclass) 123 'ignore' - ignore the character and continue with the next 124 'replace' - replace with a suitable replacement character; 125 Python will use the official U+FFFD REPLACEMENT 126 CHARACTER for the builtin Unicode codecs on 127 decoding and '?' on encoding. 128 'surrogateescape' - replace with private code points U+DCnn. 129 'xmlcharrefreplace' - Replace with the appropriate XML 130 character reference (only for encoding). 131 'backslashreplace' - Replace with backslashed escape sequences. 132 'namereplace' - Replace with \\N{...} escape sequences 133 (only for encoding). 134 135 The set of allowed values can be extended via register_error. 136 137 """ 138 def encode(self, input, errors='strict'): 139 140 """ Encodes the object input and returns a tuple (output 141 object, length consumed). 142 143 errors defines the error handling to apply. It defaults to 144 'strict' handling. 145 146 The method may not store state in the Codec instance. Use 147 StreamWriter for codecs which have to keep state in order to 148 make encoding efficient. 149 150 The encoder must be able to handle zero length input and 151 return an empty object of the output object type in this 152 situation. 153 154 """ 155 raise NotImplementedError 156 157 def decode(self, input, errors='strict'): 158 159 """ Decodes the object input and returns a tuple (output 160 object, length consumed). 161 162 input must be an object which provides the bf_getreadbuf 163 buffer slot. Python strings, buffer objects and memory 164 mapped files are examples of objects providing this slot. 165 166 errors defines the error handling to apply. It defaults to 167 'strict' handling. 168 169 The method may not store state in the Codec instance. Use 170 StreamReader for codecs which have to keep state in order to 171 make decoding efficient. 172 173 The decoder must be able to handle zero length input and 174 return an empty object of the output object type in this 175 situation. 176 177 """ 178 raise NotImplementedError 179 180class IncrementalEncoder(object): 181 """ 182 An IncrementalEncoder encodes an input in multiple steps. The input can 183 be passed piece by piece to the encode() method. The IncrementalEncoder 184 remembers the state of the encoding process between calls to encode(). 185 """ 186 def __init__(self, errors='strict'): 187 """ 188 Creates an IncrementalEncoder instance. 189 190 The IncrementalEncoder may use different error handling schemes by 191 providing the errors keyword argument. See the module docstring 192 for a list of possible values. 193 """ 194 self.errors = errors 195 self.buffer = "" 196 197 def encode(self, input, final=False): 198 """ 199 Encodes input and returns the resulting object. 200 """ 201 raise NotImplementedError 202 203 def reset(self): 204 """ 205 Resets the encoder to the initial state. 206 """ 207 208 def getstate(self): 209 """ 210 Return the current state of the encoder. 211 """ 212 return 0 213 214 def setstate(self, state): 215 """ 216 Set the current state of the encoder. state must have been 217 returned by getstate(). 218 """ 219 220class BufferedIncrementalEncoder(IncrementalEncoder): 221 """ 222 This subclass of IncrementalEncoder can be used as the baseclass for an 223 incremental encoder if the encoder must keep some of the output in a 224 buffer between calls to encode(). 225 """ 226 def __init__(self, errors='strict'): 227 IncrementalEncoder.__init__(self, errors) 228 # unencoded input that is kept between calls to encode() 229 self.buffer = "" 230 231 def _buffer_encode(self, input, errors, final): 232 # Overwrite this method in subclasses: It must encode input 233 # and return an (output, length consumed) tuple 234 raise NotImplementedError 235 236 def encode(self, input, final=False): 237 # encode input (taking the buffer into account) 238 data = self.buffer + input 239 (result, consumed) = self._buffer_encode(data, self.errors, final) 240 # keep unencoded input until the next call 241 self.buffer = data[consumed:] 242 return result 243 244 def reset(self): 245 IncrementalEncoder.reset(self) 246 self.buffer = "" 247 248 def getstate(self): 249 return self.buffer or 0 250 251 def setstate(self, state): 252 self.buffer = state or "" 253 254class IncrementalDecoder(object): 255 """ 256 An IncrementalDecoder decodes an input in multiple steps. The input can 257 be passed piece by piece to the decode() method. The IncrementalDecoder 258 remembers the state of the decoding process between calls to decode(). 259 """ 260 def __init__(self, errors='strict'): 261 """ 262 Create an IncrementalDecoder instance. 263 264 The IncrementalDecoder may use different error handling schemes by 265 providing the errors keyword argument. See the module docstring 266 for a list of possible values. 267 """ 268 self.errors = errors 269 270 def decode(self, input, final=False): 271 """ 272 Decode input and returns the resulting object. 273 """ 274 raise NotImplementedError 275 276 def reset(self): 277 """ 278 Reset the decoder to the initial state. 279 """ 280 281 def getstate(self): 282 """ 283 Return the current state of the decoder. 284 285 This must be a (buffered_input, additional_state_info) tuple. 286 buffered_input must be a bytes object containing bytes that 287 were passed to decode() that have not yet been converted. 288 additional_state_info must be a non-negative integer 289 representing the state of the decoder WITHOUT yet having 290 processed the contents of buffered_input. In the initial state 291 and after reset(), getstate() must return (b"", 0). 292 """ 293 return (b"", 0) 294 295 def setstate(self, state): 296 """ 297 Set the current state of the decoder. 298 299 state must have been returned by getstate(). The effect of 300 setstate((b"", 0)) must be equivalent to reset(). 301 """ 302 303class BufferedIncrementalDecoder(IncrementalDecoder): 304 """ 305 This subclass of IncrementalDecoder can be used as the baseclass for an 306 incremental decoder if the decoder must be able to handle incomplete 307 byte sequences. 308 """ 309 def __init__(self, errors='strict'): 310 IncrementalDecoder.__init__(self, errors) 311 # undecoded input that is kept between calls to decode() 312 self.buffer = b"" 313 314 def _buffer_decode(self, input, errors, final): 315 # Overwrite this method in subclasses: It must decode input 316 # and return an (output, length consumed) tuple 317 raise NotImplementedError 318 319 def decode(self, input, final=False): 320 # decode input (taking the buffer into account) 321 data = self.buffer + input 322 (result, consumed) = self._buffer_decode(data, self.errors, final) 323 # keep undecoded input until the next call 324 self.buffer = data[consumed:] 325 return result 326 327 def reset(self): 328 IncrementalDecoder.reset(self) 329 self.buffer = b"" 330 331 def getstate(self): 332 # additional state info is always 0 333 return (self.buffer, 0) 334 335 def setstate(self, state): 336 # ignore additional state info 337 self.buffer = state[0] 338 339# 340# The StreamWriter and StreamReader class provide generic working 341# interfaces which can be used to implement new encoding submodules 342# very easily. See encodings/utf_8.py for an example on how this is 343# done. 344# 345 346class StreamWriter(Codec): 347 348 def __init__(self, stream, errors='strict'): 349 350 """ Creates a StreamWriter instance. 351 352 stream must be a file-like object open for writing. 353 354 The StreamWriter may use different error handling 355 schemes by providing the errors keyword argument. These 356 parameters are predefined: 357 358 'strict' - raise a ValueError (or a subclass) 359 'ignore' - ignore the character and continue with the next 360 'replace'- replace with a suitable replacement character 361 'xmlcharrefreplace' - Replace with the appropriate XML 362 character reference. 363 'backslashreplace' - Replace with backslashed escape 364 sequences. 365 'namereplace' - Replace with \\N{...} escape sequences. 366 367 The set of allowed parameter values can be extended via 368 register_error. 369 """ 370 self.stream = stream 371 self.errors = errors 372 373 def write(self, object): 374 375 """ Writes the object's contents encoded to self.stream. 376 """ 377 data, consumed = self.encode(object, self.errors) 378 self.stream.write(data) 379 380 def writelines(self, list): 381 382 """ Writes the concatenated list of strings to the stream 383 using .write(). 384 """ 385 self.write(''.join(list)) 386 387 def reset(self): 388 389 """ Flushes and resets the codec buffers used for keeping state. 390 391 Calling this method should ensure that the data on the 392 output is put into a clean state, that allows appending 393 of new fresh data without having to rescan the whole 394 stream to recover state. 395 396 """ 397 pass 398 399 def seek(self, offset, whence=0): 400 self.stream.seek(offset, whence) 401 if whence == 0 and offset == 0: 402 self.reset() 403 404 def __getattr__(self, name, 405 getattr=getattr): 406 407 """ Inherit all other methods from the underlying stream. 408 """ 409 return getattr(self.stream, name) 410 411 def __enter__(self): 412 return self 413 414 def __exit__(self, type, value, tb): 415 self.stream.close() 416 417### 418 419class StreamReader(Codec): 420 421 charbuffertype = str 422 423 def __init__(self, stream, errors='strict'): 424 425 """ Creates a StreamReader instance. 426 427 stream must be a file-like object open for reading. 428 429 The StreamReader may use different error handling 430 schemes by providing the errors keyword argument. These 431 parameters are predefined: 432 433 'strict' - raise a ValueError (or a subclass) 434 'ignore' - ignore the character and continue with the next 435 'replace'- replace with a suitable replacement character 436 'backslashreplace' - Replace with backslashed escape sequences; 437 438 The set of allowed parameter values can be extended via 439 register_error. 440 """ 441 self.stream = stream 442 self.errors = errors 443 self.bytebuffer = b"" 444 self._empty_charbuffer = self.charbuffertype() 445 self.charbuffer = self._empty_charbuffer 446 self.linebuffer = None 447 448 def decode(self, input, errors='strict'): 449 raise NotImplementedError 450 451 def read(self, size=-1, chars=-1, firstline=False): 452 453 """ Decodes data from the stream self.stream and returns the 454 resulting object. 455 456 chars indicates the number of decoded code points or bytes to 457 return. read() will never return more data than requested, 458 but it might return less, if there is not enough available. 459 460 size indicates the approximate maximum number of decoded 461 bytes or code points to read for decoding. The decoder 462 can modify this setting as appropriate. The default value 463 -1 indicates to read and decode as much as possible. size 464 is intended to prevent having to decode huge files in one 465 step. 466 467 If firstline is true, and a UnicodeDecodeError happens 468 after the first line terminator in the input only the first line 469 will be returned, the rest of the input will be kept until the 470 next call to read(). 471 472 The method should use a greedy read strategy, meaning that 473 it should read as much data as is allowed within the 474 definition of the encoding and the given size, e.g. if 475 optional encoding endings or state markers are available 476 on the stream, these should be read too. 477 """ 478 # If we have lines cached, first merge them back into characters 479 if self.linebuffer: 480 self.charbuffer = self._empty_charbuffer.join(self.linebuffer) 481 self.linebuffer = None 482 483 if chars < 0: 484 # For compatibility with other read() methods that take a 485 # single argument 486 chars = size 487 488 # read until we get the required number of characters (if available) 489 while True: 490 # can the request be satisfied from the character buffer? 491 if chars >= 0: 492 if len(self.charbuffer) >= chars: 493 break 494 # we need more data 495 if size < 0: 496 newdata = self.stream.read() 497 else: 498 newdata = self.stream.read(size) 499 # decode bytes (those remaining from the last call included) 500 data = self.bytebuffer + newdata 501 if not data: 502 break 503 try: 504 newchars, decodedbytes = self.decode(data, self.errors) 505 except UnicodeDecodeError as exc: 506 if firstline: 507 newchars, decodedbytes = \ 508 self.decode(data[:exc.start], self.errors) 509 lines = newchars.splitlines(keepends=True) 510 if len(lines)<=1: 511 raise 512 else: 513 raise 514 # keep undecoded bytes until the next call 515 self.bytebuffer = data[decodedbytes:] 516 # put new characters in the character buffer 517 self.charbuffer += newchars 518 # there was no data available 519 if not newdata: 520 break 521 if chars < 0: 522 # Return everything we've got 523 result = self.charbuffer 524 self.charbuffer = self._empty_charbuffer 525 else: 526 # Return the first chars characters 527 result = self.charbuffer[:chars] 528 self.charbuffer = self.charbuffer[chars:] 529 return result 530 531 def readline(self, size=None, keepends=True): 532 533 """ Read one line from the input stream and return the 534 decoded data. 535 536 size, if given, is passed as size argument to the 537 read() method. 538 539 """ 540 # If we have lines cached from an earlier read, return 541 # them unconditionally 542 if self.linebuffer: 543 line = self.linebuffer[0] 544 del self.linebuffer[0] 545 if len(self.linebuffer) == 1: 546 # revert to charbuffer mode; we might need more data 547 # next time 548 self.charbuffer = self.linebuffer[0] 549 self.linebuffer = None 550 if not keepends: 551 line = line.splitlines(keepends=False)[0] 552 return line 553 554 readsize = size or 72 555 line = self._empty_charbuffer 556 # If size is given, we call read() only once 557 while True: 558 data = self.read(readsize, firstline=True) 559 if data: 560 # If we're at a "\r" read one extra character (which might 561 # be a "\n") to get a proper line ending. If the stream is 562 # temporarily exhausted we return the wrong line ending. 563 if (isinstance(data, str) and data.endswith("\r")) or \ 564 (isinstance(data, bytes) and data.endswith(b"\r")): 565 data += self.read(size=1, chars=1) 566 567 line += data 568 lines = line.splitlines(keepends=True) 569 if lines: 570 if len(lines) > 1: 571 # More than one line result; the first line is a full line 572 # to return 573 line = lines[0] 574 del lines[0] 575 if len(lines) > 1: 576 # cache the remaining lines 577 lines[-1] += self.charbuffer 578 self.linebuffer = lines 579 self.charbuffer = None 580 else: 581 # only one remaining line, put it back into charbuffer 582 self.charbuffer = lines[0] + self.charbuffer 583 if not keepends: 584 line = line.splitlines(keepends=False)[0] 585 break 586 line0withend = lines[0] 587 line0withoutend = lines[0].splitlines(keepends=False)[0] 588 if line0withend != line0withoutend: # We really have a line end 589 # Put the rest back together and keep it until the next call 590 self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \ 591 self.charbuffer 592 if keepends: 593 line = line0withend 594 else: 595 line = line0withoutend 596 break 597 # we didn't get anything or this was our only try 598 if not data or size is not None: 599 if line and not keepends: 600 line = line.splitlines(keepends=False)[0] 601 break 602 if readsize < 8000: 603 readsize *= 2 604 return line 605 606 def readlines(self, sizehint=None, keepends=True): 607 608 """ Read all lines available on the input stream 609 and return them as a list. 610 611 Line breaks are implemented using the codec's decoder 612 method and are included in the list entries. 613 614 sizehint, if given, is ignored since there is no efficient 615 way to finding the true end-of-line. 616 617 """ 618 data = self.read() 619 return data.splitlines(keepends) 620 621 def reset(self): 622 623 """ Resets the codec buffers used for keeping state. 624 625 Note that no stream repositioning should take place. 626 This method is primarily intended to be able to recover 627 from decoding errors. 628 629 """ 630 self.bytebuffer = b"" 631 self.charbuffer = self._empty_charbuffer 632 self.linebuffer = None 633 634 def seek(self, offset, whence=0): 635 """ Set the input stream's current position. 636 637 Resets the codec buffers used for keeping state. 638 """ 639 self.stream.seek(offset, whence) 640 self.reset() 641 642 def __next__(self): 643 644 """ Return the next decoded line from the input stream.""" 645 line = self.readline() 646 if line: 647 return line 648 raise StopIteration 649 650 def __iter__(self): 651 return self 652 653 def __getattr__(self, name, 654 getattr=getattr): 655 656 """ Inherit all other methods from the underlying stream. 657 """ 658 return getattr(self.stream, name) 659 660 def __enter__(self): 661 return self 662 663 def __exit__(self, type, value, tb): 664 self.stream.close() 665 666### 667 668class StreamReaderWriter: 669 670 """ StreamReaderWriter instances allow wrapping streams which 671 work in both read and write modes. 672 673 The design is such that one can use the factory functions 674 returned by the codec.lookup() function to construct the 675 instance. 676 677 """ 678 # Optional attributes set by the file wrappers below 679 encoding = 'unknown' 680 681 def __init__(self, stream, Reader, Writer, errors='strict'): 682 683 """ Creates a StreamReaderWriter instance. 684 685 stream must be a Stream-like object. 686 687 Reader, Writer must be factory functions or classes 688 providing the StreamReader, StreamWriter interface resp. 689 690 Error handling is done in the same way as defined for the 691 StreamWriter/Readers. 692 693 """ 694 self.stream = stream 695 self.reader = Reader(stream, errors) 696 self.writer = Writer(stream, errors) 697 self.errors = errors 698 699 def read(self, size=-1): 700 701 return self.reader.read(size) 702 703 def readline(self, size=None): 704 705 return self.reader.readline(size) 706 707 def readlines(self, sizehint=None): 708 709 return self.reader.readlines(sizehint) 710 711 def __next__(self): 712 713 """ Return the next decoded line from the input stream.""" 714 return next(self.reader) 715 716 def __iter__(self): 717 return self 718 719 def write(self, data): 720 721 return self.writer.write(data) 722 723 def writelines(self, list): 724 725 return self.writer.writelines(list) 726 727 def reset(self): 728 729 self.reader.reset() 730 self.writer.reset() 731 732 def seek(self, offset, whence=0): 733 self.stream.seek(offset, whence) 734 self.reader.reset() 735 if whence == 0 and offset == 0: 736 self.writer.reset() 737 738 def __getattr__(self, name, 739 getattr=getattr): 740 741 """ Inherit all other methods from the underlying stream. 742 """ 743 return getattr(self.stream, name) 744 745 # these are needed to make "with StreamReaderWriter(...)" work properly 746 747 def __enter__(self): 748 return self 749 750 def __exit__(self, type, value, tb): 751 self.stream.close() 752 753### 754 755class StreamRecoder: 756 757 """ StreamRecoder instances translate data from one encoding to another. 758 759 They use the complete set of APIs returned by the 760 codecs.lookup() function to implement their task. 761 762 Data written to the StreamRecoder is first decoded into an 763 intermediate format (depending on the "decode" codec) and then 764 written to the underlying stream using an instance of the provided 765 Writer class. 766 767 In the other direction, data is read from the underlying stream using 768 a Reader instance and then encoded and returned to the caller. 769 770 """ 771 # Optional attributes set by the file wrappers below 772 data_encoding = 'unknown' 773 file_encoding = 'unknown' 774 775 def __init__(self, stream, encode, decode, Reader, Writer, 776 errors='strict'): 777 778 """ Creates a StreamRecoder instance which implements a two-way 779 conversion: encode and decode work on the frontend (the 780 data visible to .read() and .write()) while Reader and Writer 781 work on the backend (the data in stream). 782 783 You can use these objects to do transparent 784 transcodings from e.g. latin-1 to utf-8 and back. 785 786 stream must be a file-like object. 787 788 encode and decode must adhere to the Codec interface; Reader and 789 Writer must be factory functions or classes providing the 790 StreamReader and StreamWriter interfaces resp. 791 792 Error handling is done in the same way as defined for the 793 StreamWriter/Readers. 794 795 """ 796 self.stream = stream 797 self.encode = encode 798 self.decode = decode 799 self.reader = Reader(stream, errors) 800 self.writer = Writer(stream, errors) 801 self.errors = errors 802 803 def read(self, size=-1): 804 805 data = self.reader.read(size) 806 data, bytesencoded = self.encode(data, self.errors) 807 return data 808 809 def readline(self, size=None): 810 811 if size is None: 812 data = self.reader.readline() 813 else: 814 data = self.reader.readline(size) 815 data, bytesencoded = self.encode(data, self.errors) 816 return data 817 818 def readlines(self, sizehint=None): 819 820 data = self.reader.read() 821 data, bytesencoded = self.encode(data, self.errors) 822 return data.splitlines(keepends=True) 823 824 def __next__(self): 825 826 """ Return the next decoded line from the input stream.""" 827 data = next(self.reader) 828 data, bytesencoded = self.encode(data, self.errors) 829 return data 830 831 def __iter__(self): 832 return self 833 834 def write(self, data): 835 836 data, bytesdecoded = self.decode(data, self.errors) 837 return self.writer.write(data) 838 839 def writelines(self, list): 840 841 data = b''.join(list) 842 data, bytesdecoded = self.decode(data, self.errors) 843 return self.writer.write(data) 844 845 def reset(self): 846 847 self.reader.reset() 848 self.writer.reset() 849 850 def seek(self, offset, whence=0): 851 # Seeks must be propagated to both the readers and writers 852 # as they might need to reset their internal buffers. 853 self.reader.seek(offset, whence) 854 self.writer.seek(offset, whence) 855 856 def __getattr__(self, name, 857 getattr=getattr): 858 859 """ Inherit all other methods from the underlying stream. 860 """ 861 return getattr(self.stream, name) 862 863 def __enter__(self): 864 return self 865 866 def __exit__(self, type, value, tb): 867 self.stream.close() 868 869### Shortcuts 870 871def open(filename, mode='r', encoding=None, errors='strict', buffering=-1): 872 873 """ Open an encoded file using the given mode and return 874 a wrapped version providing transparent encoding/decoding. 875 876 Note: The wrapped version will only accept the object format 877 defined by the codecs, i.e. Unicode objects for most builtin 878 codecs. Output is also codec dependent and will usually be 879 Unicode as well. 880 881 Underlying encoded files are always opened in binary mode. 882 The default file mode is 'r', meaning to open the file in read mode. 883 884 encoding specifies the encoding which is to be used for the 885 file. 886 887 errors may be given to define the error handling. It defaults 888 to 'strict' which causes ValueErrors to be raised in case an 889 encoding error occurs. 890 891 buffering has the same meaning as for the builtin open() API. 892 It defaults to -1 which means that the default buffer size will 893 be used. 894 895 The returned wrapped file object provides an extra attribute 896 .encoding which allows querying the used encoding. This 897 attribute is only available if an encoding was specified as 898 parameter. 899 900 """ 901 if encoding is not None and \ 902 'b' not in mode: 903 # Force opening of the file in binary mode 904 mode = mode + 'b' 905 file = builtins.open(filename, mode, buffering) 906 if encoding is None: 907 return file 908 909 try: 910 info = lookup(encoding) 911 srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors) 912 # Add attributes to simplify introspection 913 srw.encoding = encoding 914 return srw 915 except: 916 file.close() 917 raise 918 919def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'): 920 921 """ Return a wrapped version of file which provides transparent 922 encoding translation. 923 924 Data written to the wrapped file is decoded according 925 to the given data_encoding and then encoded to the underlying 926 file using file_encoding. The intermediate data type 927 will usually be Unicode but depends on the specified codecs. 928 929 Bytes read from the file are decoded using file_encoding and then 930 passed back to the caller encoded using data_encoding. 931 932 If file_encoding is not given, it defaults to data_encoding. 933 934 errors may be given to define the error handling. It defaults 935 to 'strict' which causes ValueErrors to be raised in case an 936 encoding error occurs. 937 938 The returned wrapped file object provides two extra attributes 939 .data_encoding and .file_encoding which reflect the given 940 parameters of the same name. The attributes can be used for 941 introspection by Python programs. 942 943 """ 944 if file_encoding is None: 945 file_encoding = data_encoding 946 data_info = lookup(data_encoding) 947 file_info = lookup(file_encoding) 948 sr = StreamRecoder(file, data_info.encode, data_info.decode, 949 file_info.streamreader, file_info.streamwriter, errors) 950 # Add attributes to simplify introspection 951 sr.data_encoding = data_encoding 952 sr.file_encoding = file_encoding 953 return sr 954 955### Helpers for codec lookup 956 957def getencoder(encoding): 958 959 """ Lookup up the codec for the given encoding and return 960 its encoder function. 961 962 Raises a LookupError in case the encoding cannot be found. 963 964 """ 965 return lookup(encoding).encode 966 967def getdecoder(encoding): 968 969 """ Lookup up the codec for the given encoding and return 970 its decoder function. 971 972 Raises a LookupError in case the encoding cannot be found. 973 974 """ 975 return lookup(encoding).decode 976 977def getincrementalencoder(encoding): 978 979 """ Lookup up the codec for the given encoding and return 980 its IncrementalEncoder class or factory function. 981 982 Raises a LookupError in case the encoding cannot be found 983 or the codecs doesn't provide an incremental encoder. 984 985 """ 986 encoder = lookup(encoding).incrementalencoder 987 if encoder is None: 988 raise LookupError(encoding) 989 return encoder 990 991def getincrementaldecoder(encoding): 992 993 """ Lookup up the codec for the given encoding and return 994 its IncrementalDecoder class or factory function. 995 996 Raises a LookupError in case the encoding cannot be found 997 or the codecs doesn't provide an incremental decoder. 998 999 """ 1000 decoder = lookup(encoding).incrementaldecoder 1001 if decoder is None: 1002 raise LookupError(encoding) 1003 return decoder 1004 1005def getreader(encoding): 1006 1007 """ Lookup up the codec for the given encoding and return 1008 its StreamReader class or factory function. 1009 1010 Raises a LookupError in case the encoding cannot be found. 1011 1012 """ 1013 return lookup(encoding).streamreader 1014 1015def getwriter(encoding): 1016 1017 """ Lookup up the codec for the given encoding and return 1018 its StreamWriter class or factory function. 1019 1020 Raises a LookupError in case the encoding cannot be found. 1021 1022 """ 1023 return lookup(encoding).streamwriter 1024 1025def iterencode(iterator, encoding, errors='strict', **kwargs): 1026 """ 1027 Encoding iterator. 1028 1029 Encodes the input strings from the iterator using an IncrementalEncoder. 1030 1031 errors and kwargs are passed through to the IncrementalEncoder 1032 constructor. 1033 """ 1034 encoder = getincrementalencoder(encoding)(errors, **kwargs) 1035 for input in iterator: 1036 output = encoder.encode(input) 1037 if output: 1038 yield output 1039 output = encoder.encode("", True) 1040 if output: 1041 yield output 1042 1043def iterdecode(iterator, encoding, errors='strict', **kwargs): 1044 """ 1045 Decoding iterator. 1046 1047 Decodes the input strings from the iterator using an IncrementalDecoder. 1048 1049 errors and kwargs are passed through to the IncrementalDecoder 1050 constructor. 1051 """ 1052 decoder = getincrementaldecoder(encoding)(errors, **kwargs) 1053 for input in iterator: 1054 output = decoder.decode(input) 1055 if output: 1056 yield output 1057 output = decoder.decode(b"", True) 1058 if output: 1059 yield output 1060 1061### Helpers for charmap-based codecs 1062 1063def make_identity_dict(rng): 1064 1065 """ make_identity_dict(rng) -> dict 1066 1067 Return a dictionary where elements of the rng sequence are 1068 mapped to themselves. 1069 1070 """ 1071 return {i:i for i in rng} 1072 1073def make_encoding_map(decoding_map): 1074 1075 """ Creates an encoding map from a decoding map. 1076 1077 If a target mapping in the decoding map occurs multiple 1078 times, then that target is mapped to None (undefined mapping), 1079 causing an exception when encountered by the charmap codec 1080 during translation. 1081 1082 One example where this happens is cp875.py which decodes 1083 multiple character to \\u001a. 1084 1085 """ 1086 m = {} 1087 for k,v in decoding_map.items(): 1088 if not v in m: 1089 m[v] = k 1090 else: 1091 m[v] = None 1092 return m 1093 1094### error handlers 1095 1096try: 1097 strict_errors = lookup_error("strict") 1098 ignore_errors = lookup_error("ignore") 1099 replace_errors = lookup_error("replace") 1100 xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace") 1101 backslashreplace_errors = lookup_error("backslashreplace") 1102 namereplace_errors = lookup_error("namereplace") 1103except LookupError: 1104 # In --disable-unicode builds, these error handler are missing 1105 strict_errors = None 1106 ignore_errors = None 1107 replace_errors = None 1108 xmlcharrefreplace_errors = None 1109 backslashreplace_errors = None 1110 namereplace_errors = None 1111 1112# Tell modulefinder that using codecs probably needs the encodings 1113# package 1114_false = 0 1115if _false: 1116 import encodings 1117 1118### Tests 1119 1120if __name__ == '__main__': 1121 1122 # Make stdout translate Latin-1 output into UTF-8 output 1123 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8') 1124 1125 # Have stdin translate Latin-1 input into UTF-8 input 1126 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1') 1127