1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""
9
10import builtins
11import sys
12
13### Registry and builtin stateless codec functions
14
15try:
16    from _codecs import *
17except ImportError as why:
18    raise SystemError('Failed to load the builtin codecs: %s' % why)
19
20__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
21           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
22           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
23           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
24           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
25           "StreamReader", "StreamWriter",
26           "StreamReaderWriter", "StreamRecoder",
27           "getencoder", "getdecoder", "getincrementalencoder",
28           "getincrementaldecoder", "getreader", "getwriter",
29           "encode", "decode", "iterencode", "iterdecode",
30           "strict_errors", "ignore_errors", "replace_errors",
31           "xmlcharrefreplace_errors",
32           "backslashreplace_errors", "namereplace_errors",
33           "register_error", "lookup_error"]
34
35### Constants
36
37#
38# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
39# and its possible byte string values
40# for UTF8/UTF16/UTF32 output and little/big endian machines
41#
42
43# UTF-8
44BOM_UTF8 = b'\xef\xbb\xbf'
45
46# UTF-16, little endian
47BOM_LE = BOM_UTF16_LE = b'\xff\xfe'
48
49# UTF-16, big endian
50BOM_BE = BOM_UTF16_BE = b'\xfe\xff'
51
52# UTF-32, little endian
53BOM_UTF32_LE = b'\xff\xfe\x00\x00'
54
55# UTF-32, big endian
56BOM_UTF32_BE = b'\x00\x00\xfe\xff'
57
58if sys.byteorder == 'little':
59
60    # UTF-16, native endianness
61    BOM = BOM_UTF16 = BOM_UTF16_LE
62
63    # UTF-32, native endianness
64    BOM_UTF32 = BOM_UTF32_LE
65
66else:
67
68    # UTF-16, native endianness
69    BOM = BOM_UTF16 = BOM_UTF16_BE
70
71    # UTF-32, native endianness
72    BOM_UTF32 = BOM_UTF32_BE
73
74# Old broken names (don't use in new code)
75BOM32_LE = BOM_UTF16_LE
76BOM32_BE = BOM_UTF16_BE
77BOM64_LE = BOM_UTF32_LE
78BOM64_BE = BOM_UTF32_BE
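
# Illustrative sketch (not part of this module): the BOM constants above can
# be used to detect and strip a byte order mark before decoding. The file
# name below is hypothetical.
#
#   with builtins.open("data.txt", "rb") as f:          # hypothetical file
#       raw = f.read()
#   if raw.startswith(BOM_UTF8):
#       text = raw[len(BOM_UTF8):].decode("utf-8")
#   elif raw.startswith((BOM_UTF16_LE, BOM_UTF16_BE)):
#       text = raw.decode("utf-16")                     # consumes the BOM
#   else:
#       text = raw.decode("utf-8")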


### Codec base classes (defining the API)

class CodecInfo(tuple):
    """Codec details when looking up the codec registry"""

    # Private API to allow Python 3.4 to denylist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish test encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None,
        *, _is_text_encoding=None):
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        if _is_text_encoding is not None:
            self._is_text_encoding = _is_text_encoding
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at %#x>" % \
                (self.__class__.__module__, self.__class__.__qualname__,
                 self.name, id(self))

    def __getnewargs__(self):
        return tuple(self)

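# Illustrative sketch: lookup() returns a CodecInfo instance, which behaves
# both as the 4-tuple (encode, decode, streamreader, streamwriter) and as an
# object with named attributes.
#
#   info = lookup("utf-8")
#   encoded, consumed = info.encode("abc")       # (b'abc', 3)
#   enc, dec, reader, writer = info              # tuple-style unpacking
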
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences.
         'namereplace'       - Replace with \\N{...} escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

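# Illustrative sketch: the error handling schemes listed above are selected
# by name when encoding or decoding.
#
#   "spam \u20ac".encode("ascii", "xmlcharrefreplace")   # b'spam &#8364;'
#   "spam \u20ac".encode("ascii", "backslashreplace")    # b'spam \\u20ac'
#   b"spam \xff".decode("ascii", "replace")              # 'spam \ufffd'
#   b"spam \xff".decode("ascii", "surrogateescape")      # 'spam \udcff'
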
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """

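# Illustrative sketch: an IncrementalEncoder produces output piece by piece
# and keeps any pending state between calls; final=True flushes it
# (getincrementalencoder() is defined further down in this module).
#
#   enc = getincrementalencoder("utf-16")()
#   chunks = [enc.encode("ab"), enc.encode("cd"), enc.encode("", final=True)]
#   assert b"".join(chunks) == "abcd".encode("utf-16")
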
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        return self.buffer or 0

    def setstate(self, state):
        self.buffer = state or ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]

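# Illustrative sketch: an incremental decoder buffers incomplete multi-byte
# sequences between calls, which makes it safe to feed data in arbitrary
# chunks (getincrementaldecoder() is defined further down in this module).
#
#   dec = getincrementaldecoder("utf-8")()
#   parts = [b"\xe2\x82", b"\xac"]               # U+20AC split across chunks
#   text = "".join(dec.decode(p) for p in parts) + dec.decode(b"", final=True)
#   assert text == "\u20ac"
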
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences.
             'namereplace'       - Replace with \\N{...} escape sequences.

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Resets the codec buffers used for keeping internal state.

            Calling this method should ensure that the data on the
            output is put into a clean state that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

    def __reduce_ex__(self, proto):
        raise TypeError("can't serialize %s" % self.__class__.__name__)

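# Illustrative sketch: getwriter() (defined further down in this module)
# wraps a byte stream in a codec-specific StreamWriter, so text written to
# it is encoded transparently.
#
#   import io
#   buf = io.BytesIO()
#   writer = getwriter("utf-8")(buf)
#   writer.write("caf\u00e9\n")
#   assert buf.getvalue() == b"caf\xc3\xa9\n"
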
###

class StreamReader(Codec):

    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character
             'backslashreplace' - Replace with backslashed escape sequences.

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.

            size indicates the approximate maximum number of decoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true and a UnicodeDecodeError happens
            after the first line terminator in the input, only the first
            line will be returned; the rest of the input will be kept
            until the next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        if chars < 0:
            # For compatibility with other read() methods that take a
            # single argument
            chars = size

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping internal state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

    def __reduce_ex__(self, proto):
        raise TypeError("can't serialize %s" % self.__class__.__name__)

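# Illustrative sketch: getreader() (defined further down in this module)
# wraps a byte stream in a codec-specific StreamReader; read() and
# readline() then return already decoded text.
#
#   import io
#   reader = getreader("utf-8")(io.BytesIO(b"caf\xc3\xa9\nbar\n"))
#   assert reader.readline() == "caf\u00e9\n"
#   assert reader.read() == "bar\n"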

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with StreamReaderWriter(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

    def __reduce_ex__(self, proto):
        raise TypeError("can't serialize %s" % self.__class__.__name__)

###

class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        data = b''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        # Seeks must be propagated to both the readers and writers
        # as they might need to reset their internal buffers.
        self.reader.seek(offset, whence)
        self.writer.seek(offset, whence)

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

    def __reduce_ex__(self, proto):
        raise TypeError("can't serialize %s" % self.__class__.__name__)

### Shortcuts

def open(filename, mode='r', encoding=None, errors='strict', buffering=-1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If encoding is not None, then the
        underlying encoded files are always opened in binary mode.
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to -1 which means that the default buffer size will
        be used.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        file.close()
        raise

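# Illustrative sketch: open() above returns a StreamReaderWriter that encodes
# on write and decodes on read; the file name is hypothetical. (For new code,
# builtins.open() with an encoding argument is the usual choice.)
#
#   with open("demo.txt", "w", encoding="utf-8") as f:   # hypothetical file
#       f.write("caf\u00e9\n")
#   with open("demo.txt", encoding="utf-8") as f:
#       assert f.read() == "caf\u00e9\n"
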
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is decoded according
        to the given data_encoding and then encoded to the underlying
        file using file_encoding. The intermediate data type
        will usually be Unicode but depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and then
        passed back to the caller encoded using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

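# Illustrative sketch: EncodedFile() transcodes between two byte encodings.
# Bytes written in data_encoding reach the underlying file in file_encoding.
#
#   import io
#   backing = io.BytesIO()
#   f = EncodedFile(backing, data_encoding="utf-8", file_encoding="latin-1")
#   f.write("caf\u00e9".encode("utf-8"))         # caller supplies UTF-8 bytes
#   assert backing.getvalue() == b"caf\xe9"      # stored as Latin-1
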
### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        raise LookupError(encoding)
    return encoder

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        raise LookupError(encoding)
    return decoder

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter

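# Illustrative sketch: the helpers above are thin wrappers around lookup()
# that return the individual codec entry points.
#
#   encode = getencoder("utf-8")
#   decode = getdecoder("utf-8")
#   data, written = encode("caf\u00e9")          # (b'caf\xc3\xa9', 4)
#   text, consumed = decode(data)                # ('caf\u00e9', 5)
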
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    output = encoder.encode("", True)
    if output:
        yield output

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    output = decoder.decode(b"", True)
    if output:
        yield output

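# Illustrative sketch: iterencode()/iterdecode() stream whole iterables
# through the incremental codecs, yielding chunks lazily.
#
#   chunks = [b"caf\xc3", b"\xa9", b"!"]         # UTF-8 split mid-character
#   assert "".join(iterdecode(chunks, "utf-8")) == "caf\u00e9!"
#   assert b"".join(iterencode(["a", "b"], "utf-8")) == b"ab"
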
### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    return {i: i for i in rng}

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \\u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            m[v] = None
    return m

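# Illustrative sketch: building the maps used by charmap codecs. A decoding
# map goes byte value -> character; make_encoding_map() inverts it and marks
# ambiguous targets as undefined.
#
#   decoding_map = make_identity_dict(range(32))     # {0: 0, 1: 1, ...}
#   decoding_map.update({0x80: 0x20ac, 0x81: 0x20ac})
#   encoding_map = make_encoding_map(decoding_map)
#   assert encoding_map[0x20ac] is None              # ambiguous -> undefined
#   assert encoding_map[10] == 10
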
### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
    namereplace_errors = lookup_error("namereplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
    namereplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings