• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""
9
10import builtins
11import sys
12
13### Registry and builtin stateless codec functions
14
15try:
16    from _codecs import *
17except ImportError as why:
18    raise SystemError('Failed to load the builtin codecs: %s' % why)
19
# Public names of this module, i.e. what "from codecs import *" exports.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "backslashreplace_errors", "namereplace_errors",
           "register_error", "lookup_error"]
34
35### Constants
36
#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF): the byte
# strings it takes in UTF-8/UTF-16/UTF-32 output, for little and big
# endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, explicit byte order (the short names are plain aliases)
BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32, explicit byte order
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# UTF-16, native endianness of the running interpreter
BOM_UTF16 = BOM_UTF16_LE if sys.byteorder == 'little' else BOM_UTF16_BE
BOM = BOM_UTF16

# UTF-32, native endianness of the running interpreter
BOM_UTF32 = BOM_UTF32_LE if sys.byteorder == 'little' else BOM_UTF32_BE

# Old broken names (don't use in new code): despite the "32"/"64" in the
# names they are bound to the UTF-16 and UTF-32 marks respectively.
BOM32_LE, BOM32_BE = BOM_UTF16_LE, BOM_UTF16_BE
BOM64_LE, BOM64_BE = BOM_UTF32_LE, BOM_UTF32_BE
79
80
81### Codec base classes (defining the API)
82
class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    The object is a 4-tuple (encode, decode, streamreader, streamwriter).
    The same four objects, the two incremental factories and the codec
    name are additionally available as instance attributes.
    """

    # Private API that lets Python 3.4 blacklist the known non-Unicode
    # codecs of the standard library. A more general mechanism to
    # reliably tell text encodings apart from other codecs was hoped
    # for in Python 3.5.
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True  # codecs are text encodings unless stated otherwise

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None,
        *, _is_text_encoding=None):
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        attributes = (
            ("name", name),
            ("encode", encode),
            ("decode", decode),
            ("incrementalencoder", incrementalencoder),
            ("incrementaldecoder", incrementaldecoder),
            ("streamwriter", streamwriter),
            ("streamreader", streamreader),
        )
        for attribute, value in attributes:
            setattr(info, attribute, value)
        # Shadow the class-level default only when the caller was explicit.
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        kind = self.__class__
        return (f"<{kind.__module__!s}.{kind.__qualname__!s} object "
                f"for encoding {self.name!s} at {id(self):#x}>")
113
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme. The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences.
         'namereplace'       - Replace with \\N{...} escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input; return a tuple (output object,
            length consumed).

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not keep state in the Codec instance;
            codecs that need state to encode efficiently should use
            StreamWriter instead.

            Zero length input must be supported and must yield an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input; return a tuple (output object,
            length consumed).

            input must be a bytes-like object (historically described
            as an object providing the bf_getreadbuf buffer slot).

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not keep state in the Codec instance;
            codecs that need state to decode efficiently should use
            StreamReader instead.

            Zero length input must be supported and must yield an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
179
class IncrementalEncoder:
    """
    Encoder that processes its input in several steps.

    Pieces of the input are handed to encode() one after the other; the
    encoder keeps the state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Set up a new IncrementalEncoder.

        errors selects the error handling scheme to use. See the module
        docstring for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Subclasses must override this; the base version always raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the encoder back into its initial state.

        The base version does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base version always reports 0.
        """
        return 0

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate().

        The base version does nothing.
        """
219
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders that cannot always consume all
    of their input at once: whatever _buffer_encode() leaves unconsumed
    is kept in a buffer and prepended to the input of the next encode()
    call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input not yet encoded, carried over to the next encode() call
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return a tuple
        # (output, length consumed)
        raise NotImplementedError

    def encode(self, input, final=False):
        # leftovers from the previous call come first
        pending = self.buffer + input
        encoded, used = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return encoded

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        leftover = self.buffer
        return leftover if leftover else 0

    def setstate(self, state):
        # a false state (such as 0) stands for an empty buffer
        self.buffer = state if state else ""
253
class IncrementalDecoder:
    """
    Decoder that processes its input in several steps.

    Pieces of the input are handed to decode() one after the other; the
    decoder keeps the state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Set up a new IncrementalDecoder.

        errors selects the error handling scheme to use. See the module
        docstring for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Subclasses must override this; the base version always raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the decoder back into its initial state.

        The base version does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        The state is a (buffered_input, additional_state_info) tuple.
        buffered_input is a bytes object holding the bytes that were
        passed to decode() but have not been converted yet.
        additional_state_info is a non-negative integer describing the
        decoder state WITHOUT the contents of buffered_input having
        been processed. In the initial state and after reset(),
        getstate() must return (b"", 0), which is also what the base
        version always returns.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate().

        setstate((b"", 0)) must have the same effect as reset(). The
        base version does nothing.
        """
302
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders that have to cope with incomplete
    byte sequences: whatever _buffer_decode() leaves unconsumed is kept in
    a buffer and prepended to the input of the next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input not yet decoded, carried over to the next decode() call
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return a tuple
        # (output, length consumed)
        raise NotImplementedError

    def decode(self, input, final=False):
        # leftovers from the previous call come first
        pending = self.buffer + input
        decoded, used = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return decoded

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # there is no additional state info, so it is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # only the buffered input matters; the state info is ignored
        buffered = state[0]
        self.buffer = buffered
338
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
345
class StreamWriter(Codec):

    """ Wraps a writable stream: objects handed to .write() are run
        through self.encode() and the result is written to the stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter around stream.

            stream must be a file-like object open for writing.

            errors selects the error handling scheme of the
            StreamWriter. These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences.
             'namereplace'       - Replace with \\N{...} escape sequences.

            Further values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object and write the result to self.stream.
        """
        # the "length consumed" half of the encode() result is not needed
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the given strings and send them to the stream through
            a single .write() call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            After this call the data on the output should be in a
            clean state, so that fresh data can be appended without
            rescanning the whole stream to recover state.

            The base implementation has nothing to do.

        """
        return None

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # only a rewind to the very beginning resets the codec state
        if whence != 0 or offset != 0:
            return
        self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every other attribute on the underlying stream.
        """
        target = self.stream
        return getattr(target, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the underlying stream
        self.stream.close()
416
417###
418
class StreamReader(Codec):

    """ Wraps a readable stream and hands out decoded data.

        Data is pulled from self.stream, decoded with self.decode()
        (which subclasses must provide) and kept in internal buffers
        until it is requested through read(), readline(), readlines()
        or iteration.
    """

    # Type of the decoded data; calling it without arguments yields the
    # empty value that the character buffer is initialised with.
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'backslashreplace' - Replace with backslashed escape sequences;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that have not been decoded yet
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # decoded data that has not been returned to the caller yet
        self.charbuffer = self._empty_charbuffer
        # list of already split lines cached by readline(), or None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        """ Decode input and return a tuple (output object, length
            consumed); this base version always raises
            NotImplementedError.
        """
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.
            If chars is negative, the value of size is used for it.

            size indicates the approximate maximum number of decoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        if chars < 0:
            # For compatibility with other read() methods that take a
            # single argument
            chars = size

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # retry with only the bytes in front of the error
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    # the error is only tolerated when that prefix splits
                    # into more than one line
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method; read() is then called only once.

            If keepends is false, the line ending is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        # start with 72 units when no size (or a size of 0) was given;
        # the value is doubled further down while no line end shows up
        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        # charbuffer is rebuilt from linebuffer by read(),
                        # or from its last entry by readline()
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            # no complete line yet: ask for more next time, but stop
            # doubling once readsize has reached 8000
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries if keepends
            is true.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        # an empty line from readline() ends the iteration
        raise StopIteration

    def __iter__(self):
        """ The reader is its own iterator. """
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        """ Return the reader itself for use in a with statement. """
        return self

    def __exit__(self, type, value, tb):
        """ Close the underlying stream when the with block is left. """
        self.stream.close()
665
666###
667
class StreamReaderWriter:

    """ Wrapper for streams that are used for both reading and
        writing.

        Reading is delegated to a reader object, writing to a writer
        object; both are created from factories such as the ones
        returned by the codec.lookup() function.

    """
    # Optional attribute set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Set up a StreamReaderWriter around stream.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            resp.; each is called with (stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Forward to the reader's read(). """
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        """ Forward to the reader's readline(). """
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        """ Forward to the reader's readlines(). """
        reader = self.reader
        return reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return next(reader)

    def __iter__(self):
        return self

    def write(self, data):

        """ Forward to the writer's write(). """
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        """ Forward to the writer's writelines(). """
        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        """ Reset the reader first, then the writer. """
        for codec_side in (self.reader, self.writer):
            codec_side.reset()

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # the reader is reset on every seek ...
        self.reader.reset()
        # ... the writer only on a rewind to the very beginning
        rewinding = whence == 0 and offset == 0
        if rewinding:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every other attribute on the underlying stream.
        """
        target = self.stream
        return getattr(target, name)

    # context manager support, so that "with StreamReaderWriter(...)"
    # works properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
752
753###
754
class StreamRecoder:

    """ Translates data from one encoding to another on the fly.

        The complete set of APIs returned by the codecs.lookup()
        function is used for this.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the
        provided Writer class.

        In the other direction, data is read from the underlying stream
        using a Reader instance and then encoded and returned to the
        caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Set up a StreamRecoder implementing a two-way conversion:
            encode and decode work on the frontend (the data visible
            to .read() and .write()) while Reader and Writer work on
            the backend (the data in stream).

            Such objects allow transparent transcodings from e.g.
            latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader
            and Writer must be factory functions or classes providing
            the StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read through the reader and return the data re-encoded
            for the frontend.
        """
        decoded = self.reader.read(size)
        recoded, _consumed = self.encode(decoded, self.errors)
        return recoded

    def readline(self, size=None):

        """ Read one line through the reader and return it re-encoded
            for the frontend.
        """
        # without a size the reader's own default is left in charge
        decoded = (self.reader.readline() if size is None
                   else self.reader.readline(size))
        recoded, _consumed = self.encode(decoded, self.errors)
        return recoded

    def readlines(self, sizehint=None):

        """ Read everything, re-encode it and return it split into
            lines (line ends kept); sizehint is not used.
        """
        decoded = self.reader.read()
        recoded, _consumed = self.encode(decoded, self.errors)
        return recoded.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            for the frontend.
        """
        decoded = next(self.reader)
        recoded, _consumed = self.encode(decoded, self.errors)
        return recoded

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode frontend data and pass it on to the writer. """
        decoded, _consumed = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):

        """ Join the given bytes objects, decode the result and pass
            it on to the writer in one go.
        """
        joined = b''.join(list)
        decoded, _consumed = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):

        """ Reset the reader first, then the writer. """
        for codec_side in (self.reader, self.writer):
            codec_side.reset()

    def seek(self, offset, whence=0):
        # Both the reader and the writer have to see the seek, since
        # either may need to reset its internal buffers.
        for codec_side in (self.reader, self.writer):
            codec_side.seek(offset, whence)

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every other attribute on the underlying stream.
        """
        target = self.stream
        return getattr(target, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the underlying stream
        self.stream.close()
868
869### Shortcuts
870
def open(filename, mode='r', encoding=None, errors='strict', buffering=-1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the underlying file is always opened
        in binary mode ('b' is appended to mode when missing).
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to -1 which means that the default buffer size will
        be used.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the creation of the wrapper fails, the
        file that was already opened is closed again before the
        exception is propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Nobody else holds a reference to the file yet, so it has to be
        # closed here or it would leak (e.g. for an unknown encoding).
        file.close()
        raise
913
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings and return the wrapper.

        Data written to the wrapper is decoded using data_encoding and
        then encoded to the underlying file using file_encoding. The
        intermediate data type will usually be Unicode but depends on
        the specified codecs.

        Bytes read from the file are decoded using file_encoding and
        handed to the caller encoded using data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        For introspection the wrapper carries two extra attributes,
        .data_encoding and .file_encoding, holding the encoding names
        that were used.

    """
    file_encoding = data_encoding if file_encoding is None else file_encoding
    data_codec = lookup(data_encoding)
    file_codec = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        data_codec.encode,
        data_codec.decode,
        file_codec.streamreader,
        file_codec.streamwriter,
        errors,
    )
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
949
950### Helpers for codec lookup
951
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
961
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
971
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
985
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
999
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
1009
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
1019
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string produced by iterator through one
    IncrementalEncoder for encoding and yields each non-empty result.
    After the input is exhausted the encoder is called a final time so
    that any pending output is yielded as well.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Final call: signal the end of the input to the encoder.
    tail = encoder.encode("", True)
    if tail:
        yield tail
1037
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every item produced by iterator through one
    IncrementalDecoder for encoding and yields each non-empty result.
    After the input is exhausted the decoder is called a final time so
    that any buffered input is decoded and yielded as well.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Final call: signal the end of the input to the decoder.
    tail = decoder.decode(b"", True)
    if tail:
        yield tail
1055
1056### Helpers for charmap-based codecs
1057
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a dictionary in which every element of rng
        is both a key and the value stored under that key.

    """
    identity = {}
    for item in rng:
        identity[item] = item
    return identity
1067
def make_encoding_map(decoding_map):

    """ Create an encoding map by inverting a decoding map.

        A target that occurs more than once in the decoding map is
        mapped to None (undefined mapping), causing an exception when
        encountered by the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # The first occurrence of a target records its source; any
        # further occurrence makes the mapping ambiguous.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
1088
1089### error handlers
1090
try:
    # Module-level aliases for the registered error handlers.
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
    namereplace_errors = lookup_error("namereplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing;
    # all aliases are then defined as None.
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
    namereplace_errors = None
1106
# Tell modulefinder that using codecs probably needs the encodings
# package.
#
# _false is 0, so the import below is never executed at runtime; the
# statement exists only so that the import is present in the module.
# NOTE(review): presumably a variable is tested instead of a literal
# so the branch is not removed at compile time -- confirm.
_false = 0
if _false:
    import encodings
1112
1113### Tests
1114
if __name__ == '__main__':

    # EncodedFile() recodes bytes, so it has to wrap a binary stream.
    # The standard streams are text streams; use their underlying
    # binary buffer when they have one and fall back to the stream
    # itself otherwise (e.g. when it was replaced by another object).

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(getattr(sys.stdout, 'buffer', sys.stdout),
                             'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(getattr(sys.stdin, 'buffer', sys.stdin),
                            'utf-8', 'latin-1')
1122